diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,248869 @@ +{ + "best_metric": 0.024749755859375, + "best_model_checkpoint": "./results_morgan/checkpoint-4000000", + "epoch": 0.0012, + "eval_steps": 20000, + "global_step": 4120000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 4.932819633999246e-05, + "loss": 2.5863, + "step": 100 + }, + { + "epoch": 0.0, + "learning_rate": 5.719504324825564e-05, + "loss": 1.4847, + "step": 200 + }, + { + "epoch": 0.0, + "learning_rate": 6.170868326030393e-05, + "loss": 1.2923, + "step": 300 + }, + { + "epoch": 0.0, + "learning_rate": 6.488740554563935e-05, + "loss": 1.186, + "step": 400 + }, + { + "epoch": 0.0, + "learning_rate": 6.734317372309117e-05, + "loss": 1.1051, + "step": 500 + }, + { + "epoch": 0.0, + "learning_rate": 6.934466112452983e-05, + "loss": 1.0482, + "step": 600 + }, + { + "epoch": 0.0, + "learning_rate": 7.103398676137137e-05, + "loss": 0.9834, + "step": 700 + }, + { + "epoch": 0.0, + "learning_rate": 7.24955125606774e-05, + "loss": 0.9314, + "step": 800 + }, + { + "epoch": 0.0, + "learning_rate": 7.378343796989793e-05, + "loss": 0.8974, + "step": 900 + }, + { + "epoch": 0.0, + "learning_rate": 7.493465960993282e-05, + "loss": 0.8582, + "step": 1000 + }, + { + "epoch": 0.0, + "learning_rate": 7.59754330499353e-05, + "loss": 0.8014, + "step": 1100 + }, + { + "epoch": 0.0, + "learning_rate": 7.692510816983375e-05, + "loss": 0.8006, + "step": 1200 + }, + { + "epoch": 0.0, + "learning_rate": 7.779835690831703e-05, + "loss": 0.7641, + "step": 1300 + }, + { + "epoch": 0.0, + "learning_rate": 7.860656934404977e-05, + "loss": 0.74, + "step": 1400 + }, + { + "epoch": 0.0, + "learning_rate": 7.93587649369845e-05, + "loss": 0.7245, + "step": 1500 + }, + { + "epoch": 0.0, + "learning_rate": 8.006220792650233e-05, + "loss": 0.7122, + "step": 1600 + }, + { + "epoch": 0.0, + "learning_rate": 8.07228351498672e-05, + "loss": 0.6768, + "step": 1700 + }, + { + "epoch": 0.0, + "learning_rate": 8.134556096770183e-05, + "loss": 0.656, + "step": 1800 + }, + { + "epoch": 0.0, + "learning_rate": 8.193449936668137e-05, + "loss": 0.6605, + "step": 1900 + }, + { + "epoch": 0.0, + "learning_rate": 8.249312884939091e-05, + "loss": 0.6269, + "step": 2000 + }, + { + "epoch": 0.0, + "learning_rate": 8.302441693357058e-05, + "loss": 0.6224, + "step": 2100 + }, + { + "epoch": 0.0, + "learning_rate": 8.353091558096729e-05, + "loss": 0.6004, + "step": 2200 + }, + { + "epoch": 0.0, + "learning_rate": 8.40148353391312e-05, + "loss": 0.579, + "step": 2300 + }, + { + "epoch": 0.0, + "learning_rate": 8.447810365175978e-05, + "loss": 0.5846, + "step": 2400 + }, + { + "epoch": 0.0, + "learning_rate": 8.49224112285631e-05, + "loss": 0.5704, + "step": 2500 + }, + { + "epoch": 0.0, + "learning_rate": 8.534924929370153e-05, + "loss": 0.5632, + "step": 2600 + }, + { + "epoch": 0.0, + "learning_rate": 8.575590883528774e-05, + "loss": 0.5384, + "step": 2700 + }, + { + "epoch": 0.0, + "learning_rate": 8.615177339252512e-05, + "loss": 0.5403, + "step": 2800 + }, + { + "epoch": 0.0, + "learning_rate": 8.653371084119956e-05, + "loss": 0.5251, + "step": 2900 + }, + { + "epoch": 0.0, + "learning_rate": 8.690266792100479e-05, + "loss": 0.5205, + "step": 3000 + }, + { + "epoch": 0.0, + "learning_rate": 8.725949800007947e-05, + "loss": 0.5042, + "step": 3100 + }, + { + "epoch": 0.0, + "learning_rate": 8.760497296348612e-05, + "loss": 0.5014, + "step": 3200 + }, + { + "epoch": 0.0, + "learning_rate": 8.793979326779768e-05, + "loss": 0.4824, + "step": 3300 + }, + { + "epoch": 0.0, + "learning_rate": 8.826459649112794e-05, + "loss": 0.4899, + "step": 3400 + }, + { + "epoch": 0.0, + "learning_rate": 8.857996464094115e-05, + "loss": 0.479, + "step": 3500 + }, + { + "epoch": 0.0, + "learning_rate": 8.888643043011622e-05, + "loss": 0.472, + "step": 3600 + }, + { + "epoch": 0.0, + "learning_rate": 8.918448269127446e-05, + "loss": 0.4624, + "step": 3700 + }, + { + "epoch": 0.0, + "learning_rate": 8.947457106756977e-05, + "loss": 0.4584, + "step": 3800 + }, + { + "epoch": 0.0, + "learning_rate": 8.975711009295404e-05, + "loss": 0.4585, + "step": 3900 + }, + { + "epoch": 0.0, + "learning_rate": 9.002704319460056e-05, + "loss": 0.4445, + "step": 4000 + }, + { + "epoch": 0.0, + "learning_rate": 9.029573698745933e-05, + "loss": 0.4458, + "step": 4100 + }, + { + "epoch": 0.0, + "learning_rate": 9.055794152084609e-05, + "loss": 0.4392, + "step": 4200 + }, + { + "epoch": 0.0, + "learning_rate": 9.081396286331679e-05, + "loss": 0.4284, + "step": 4300 + }, + { + "epoch": 0.0, + "learning_rate": 9.106408592760968e-05, + "loss": 0.4272, + "step": 4400 + }, + { + "epoch": 0.0, + "learning_rate": 9.130857637656785e-05, + "loss": 0.4085, + "step": 4500 + }, + { + "epoch": 0.0, + "learning_rate": 9.154768231915052e-05, + "loss": 0.4034, + "step": 4600 + }, + { + "epoch": 0.0, + "learning_rate": 9.178163582367895e-05, + "loss": 0.4052, + "step": 4700 + }, + { + "epoch": 0.0, + "learning_rate": 9.201065427145362e-05, + "loss": 0.4068, + "step": 4800 + }, + { + "epoch": 0.0, + "learning_rate": 9.223494157053206e-05, + "loss": 0.4046, + "step": 4900 + }, + { + "epoch": 0.0, + "learning_rate": 9.245468924665303e-05, + "loss": 0.3919, + "step": 5000 + }, + { + "epoch": 0.0, + "learning_rate": 9.267007742593345e-05, + "loss": 0.3837, + "step": 5100 + }, + { + "epoch": 0.0, + "learning_rate": 9.288127572197122e-05, + "loss": 0.3848, + "step": 5200 + }, + { + "epoch": 0.0, + "learning_rate": 9.308844403830141e-05, + "loss": 0.3861, + "step": 5300 + }, + { + "epoch": 0.0, + "learning_rate": 9.329173329571588e-05, + "loss": 0.3732, + "step": 5400 + }, + { + "epoch": 0.0, + "learning_rate": 9.348930861125227e-05, + "loss": 0.3717, + "step": 5500 + }, + { + "epoch": 0.0, + "learning_rate": 9.368529519716057e-05, + "loss": 0.3737, + "step": 5600 + }, + { + "epoch": 0.0, + "learning_rate": 9.387780665987676e-05, + "loss": 0.3573, + "step": 5700 + }, + { + "epoch": 0.0, + "learning_rate": 9.40669640931859e-05, + "loss": 0.3696, + "step": 5800 + }, + { + "epoch": 0.0, + "learning_rate": 9.425288236967753e-05, + "loss": 0.3638, + "step": 5900 + }, + { + "epoch": 0.0, + "learning_rate": 9.443567055973278e-05, + "loss": 0.3572, + "step": 6000 + }, + { + "epoch": 0.0, + "learning_rate": 9.461543231582187e-05, + "loss": 0.3521, + "step": 6100 + }, + { + "epoch": 0.0, + "learning_rate": 9.479226622550294e-05, + "loss": 0.3458, + "step": 6200 + }, + { + "epoch": 0.0, + "learning_rate": 9.496626613613171e-05, + "loss": 0.3307, + "step": 6300 + }, + { + "epoch": 0.0, + "learning_rate": 9.513752145396e-05, + "loss": 0.3307, + "step": 6400 + }, + { + "epoch": 0.0, + "learning_rate": 9.530611742000922e-05, + "loss": 0.3446, + "step": 6500 + }, + { + "epoch": 0.0, + "learning_rate": 9.547213536485023e-05, + "loss": 0.3377, + "step": 6600 + }, + { + "epoch": 0.0, + "learning_rate": 9.563565294419558e-05, + "loss": 0.3245, + "step": 6700 + }, + { + "epoch": 0.0, + "learning_rate": 9.579674435701253e-05, + "loss": 0.3279, + "step": 6800 + }, + { + "epoch": 0.0, + "learning_rate": 9.595548054769064e-05, + "loss": 0.3227, + "step": 6900 + }, + { + "epoch": 0.0, + "learning_rate": 9.611192939364202e-05, + "loss": 0.3319, + "step": 7000 + }, + { + "epoch": 0.0, + "learning_rate": 9.626615587957666e-05, + "loss": 0.3167, + "step": 7100 + }, + { + "epoch": 0.0, + "learning_rate": 9.641822225957206e-05, + "loss": 0.3196, + "step": 7200 + }, + { + "epoch": 0.0, + "learning_rate": 9.656818820794935e-05, + "loss": 0.313, + "step": 7300 + }, + { + "epoch": 0.0, + "learning_rate": 9.671611095987065e-05, + "loss": 0.3142, + "step": 7400 + }, + { + "epoch": 0.0, + "learning_rate": 9.685769582820096e-05, + "loss": 0.3081, + "step": 7500 + }, + { + "epoch": 0.0, + "learning_rate": 9.700175210160166e-05, + "loss": 0.3183, + "step": 7600 + }, + { + "epoch": 0.0, + "learning_rate": 9.714392202673169e-05, + "loss": 0.3224, + "step": 7700 + }, + { + "epoch": 0.0, + "learning_rate": 9.72842543674037e-05, + "loss": 0.3041, + "step": 7800 + }, + { + "epoch": 0.0, + "learning_rate": 9.742279602065062e-05, + "loss": 0.2954, + "step": 7900 + }, + { + "epoch": 0.0, + "learning_rate": 9.755959211081178e-05, + "loss": 0.3045, + "step": 8000 + }, + { + "epoch": 0.0, + "learning_rate": 9.769468607776539e-05, + "loss": 0.2967, + "step": 8100 + }, + { + "epoch": 0.0, + "learning_rate": 9.782811975973896e-05, + "loss": 0.3057, + "step": 8200 + }, + { + "epoch": 0.0, + "learning_rate": 9.795993347109249e-05, + "loss": 0.2927, + "step": 8300 + }, + { + "epoch": 0.0, + "learning_rate": 9.809016607543647e-05, + "loss": 0.3007, + "step": 8400 + }, + { + "epoch": 0.0, + "learning_rate": 9.821885505441631e-05, + "loss": 0.2893, + "step": 8500 + }, + { + "epoch": 0.0, + "learning_rate": 9.834477210358549e-05, + "loss": 0.2889, + "step": 8600 + }, + { + "epoch": 0.0, + "learning_rate": 9.847049562565526e-05, + "loss": 0.2926, + "step": 8700 + }, + { + "epoch": 0.0, + "learning_rate": 9.859477997323254e-05, + "loss": 0.2776, + "step": 8800 + }, + { + "epoch": 0.0, + "learning_rate": 9.871765772262129e-05, + "loss": 0.2688, + "step": 8900 + }, + { + "epoch": 0.0, + "learning_rate": 9.883795203612481e-05, + "loss": 0.2951, + "step": 9000 + }, + { + "epoch": 0.0, + "learning_rate": 9.895812329110132e-05, + "loss": 0.2865, + "step": 9100 + }, + { + "epoch": 0.0, + "learning_rate": 9.907697901604568e-05, + "loss": 0.2817, + "step": 9200 + }, + { + "epoch": 0.0, + "learning_rate": 9.919454770189497e-05, + "loss": 0.2769, + "step": 9300 + }, + { + "epoch": 0.0, + "learning_rate": 9.931085692393412e-05, + "loss": 0.2857, + "step": 9400 + }, + { + "epoch": 0.0, + "learning_rate": 9.942593338061702e-05, + "loss": 0.2677, + "step": 9500 + }, + { + "epoch": 0.0, + "learning_rate": 9.953980293035202e-05, + "loss": 0.2693, + "step": 9600 + }, + { + "epoch": 0.0, + "learning_rate": 9.965249062637823e-05, + "loss": 0.2803, + "step": 9700 + }, + { + "epoch": 0.0, + "learning_rate": 9.976402074985049e-05, + "loss": 0.2663, + "step": 9800 + }, + { + "epoch": 0.0, + "learning_rate": 9.987441684124227e-05, + "loss": 0.2725, + "step": 9900 + }, + { + "epoch": 0.0, + "learning_rate": 9.998370173016803e-05, + "loss": 0.2639, + "step": 10000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.2637, + "step": 10100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.2627, + "step": 10200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.2681, + "step": 10300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.2696, + "step": 10400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.263, + "step": 10500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.2637, + "step": 10600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.2607, + "step": 10700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.2502, + "step": 10800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.252, + "step": 10900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.2483, + "step": 11000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.2449, + "step": 11100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.2397, + "step": 11200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.2481, + "step": 11300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.2459, + "step": 11400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.2437, + "step": 11500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.2456, + "step": 11600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.2518, + "step": 11700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.2371, + "step": 11800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.2379, + "step": 11900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.2374, + "step": 12000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.2409, + "step": 12100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.2401, + "step": 12200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.2376, + "step": 12300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.2414, + "step": 12400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.2348, + "step": 12500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.2314, + "step": 12600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.2223, + "step": 12700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.2256, + "step": 12800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.2287, + "step": 12900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.2217, + "step": 13000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.2317, + "step": 13100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.2208, + "step": 13200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.2282, + "step": 13300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.2182, + "step": 13400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.2171, + "step": 13500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.2223, + "step": 13600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.217, + "step": 13700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.2124, + "step": 13800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.2205, + "step": 13900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.2123, + "step": 14000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.2186, + "step": 14100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.2087, + "step": 14200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.2165, + "step": 14300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.2119, + "step": 14400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.2051, + "step": 14500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.2081, + "step": 14600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1978, + "step": 14700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.211, + "step": 14800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.2126, + "step": 14900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1937, + "step": 15000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.2059, + "step": 15100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.2036, + "step": 15200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.2041, + "step": 15300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.2034, + "step": 15400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.2027, + "step": 15500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.2017, + "step": 15600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.2008, + "step": 15700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1966, + "step": 15800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.2037, + "step": 15900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.2068, + "step": 16000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1993, + "step": 16100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.201, + "step": 16200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.2006, + "step": 16300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1987, + "step": 16400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1846, + "step": 16500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1887, + "step": 16600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1891, + "step": 16700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1941, + "step": 16800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1862, + "step": 16900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.192, + "step": 17000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1948, + "step": 17100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1971, + "step": 17200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1902, + "step": 17300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1848, + "step": 17400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1934, + "step": 17500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1867, + "step": 17600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1817, + "step": 17700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1854, + "step": 17800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1892, + "step": 17900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1913, + "step": 18000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1831, + "step": 18100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1882, + "step": 18200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1875, + "step": 18300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1892, + "step": 18400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1781, + "step": 18500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1839, + "step": 18600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1782, + "step": 18700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1799, + "step": 18800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1808, + "step": 18900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1859, + "step": 19000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.186, + "step": 19100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1852, + "step": 19200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1773, + "step": 19300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1695, + "step": 19400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1787, + "step": 19500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1699, + "step": 19600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1724, + "step": 19700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1739, + "step": 19800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1792, + "step": 19900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1771, + "step": 20000 + }, + { + "epoch": 0.0, + "eval_loss": 0.1444091796875, + "eval_runtime": 3165.6765, + "eval_samples_per_second": 355.287, + "eval_steps_per_second": 22.206, + "step": 20000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1747, + "step": 20100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1747, + "step": 20200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1726, + "step": 20300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1649, + "step": 20400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1669, + "step": 20500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1687, + "step": 20600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1764, + "step": 20700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1673, + "step": 20800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1702, + "step": 20900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1619, + "step": 21000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1684, + "step": 21100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.171, + "step": 21200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1681, + "step": 21300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1692, + "step": 21400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1702, + "step": 21500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1589, + "step": 21600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1663, + "step": 21700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1596, + "step": 21800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1571, + "step": 21900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1688, + "step": 22000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1648, + "step": 22100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1635, + "step": 22200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1634, + "step": 22300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.165, + "step": 22400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.165, + "step": 22500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1643, + "step": 22600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1677, + "step": 22700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1597, + "step": 22800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1576, + "step": 22900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1637, + "step": 23000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1579, + "step": 23100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1627, + "step": 23200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1542, + "step": 23300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1561, + "step": 23400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1592, + "step": 23500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1492, + "step": 23600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1515, + "step": 23700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1545, + "step": 23800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1608, + "step": 23900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1602, + "step": 24000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1487, + "step": 24100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1565, + "step": 24200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.15, + "step": 24300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.162, + "step": 24400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1586, + "step": 24500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1559, + "step": 24600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.167, + "step": 24700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1494, + "step": 24800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1534, + "step": 24900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1444, + "step": 25000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1579, + "step": 25100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1505, + "step": 25200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1455, + "step": 25300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1544, + "step": 25400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1466, + "step": 25500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.149, + "step": 25600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1454, + "step": 25700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1508, + "step": 25800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1475, + "step": 25900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.155, + "step": 26000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1457, + "step": 26100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1467, + "step": 26200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1421, + "step": 26300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1483, + "step": 26400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1441, + "step": 26500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1476, + "step": 26600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.151, + "step": 26700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1435, + "step": 26800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1469, + "step": 26900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1454, + "step": 27000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1371, + "step": 27100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1417, + "step": 27200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.151, + "step": 27300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1419, + "step": 27400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1366, + "step": 27500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1487, + "step": 27600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1434, + "step": 27700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1338, + "step": 27800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1492, + "step": 27900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1533, + "step": 28000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1468, + "step": 28100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1529, + "step": 28200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1463, + "step": 28300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1479, + "step": 28400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1377, + "step": 28500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1405, + "step": 28600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1449, + "step": 28700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1399, + "step": 28800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1438, + "step": 28900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.143, + "step": 29000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1349, + "step": 29100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1437, + "step": 29200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1401, + "step": 29300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1449, + "step": 29400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.131, + "step": 29500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1344, + "step": 29600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1379, + "step": 29700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1379, + "step": 29800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1343, + "step": 29900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1355, + "step": 30000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1357, + "step": 30100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1291, + "step": 30200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1401, + "step": 30300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1354, + "step": 30400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1371, + "step": 30500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1386, + "step": 30600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1348, + "step": 30700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1303, + "step": 30800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.134, + "step": 30900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.137, + "step": 31000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.133, + "step": 31100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1312, + "step": 31200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1303, + "step": 31300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1328, + "step": 31400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1362, + "step": 31500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1373, + "step": 31600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1269, + "step": 31700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1395, + "step": 31800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1364, + "step": 31900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.125, + "step": 32000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1342, + "step": 32100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1323, + "step": 32200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.129, + "step": 32300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1322, + "step": 32400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.131, + "step": 32500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.135, + "step": 32600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.124, + "step": 32700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1257, + "step": 32800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1314, + "step": 32900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1275, + "step": 33000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1301, + "step": 33100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1284, + "step": 33200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1288, + "step": 33300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1258, + "step": 33400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1275, + "step": 33500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1294, + "step": 33600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.121, + "step": 33700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1321, + "step": 33800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1225, + "step": 33900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1257, + "step": 34000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1278, + "step": 34100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1237, + "step": 34200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1244, + "step": 34300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1258, + "step": 34400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1212, + "step": 34500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1205, + "step": 34600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1255, + "step": 34700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1234, + "step": 34800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1286, + "step": 34900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1259, + "step": 35000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1211, + "step": 35100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.123, + "step": 35200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1221, + "step": 35300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1294, + "step": 35400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1186, + "step": 35500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1221, + "step": 35600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1276, + "step": 35700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1254, + "step": 35800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.123, + "step": 35900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1256, + "step": 36000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.122, + "step": 36100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1269, + "step": 36200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1195, + "step": 36300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1252, + "step": 36400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1204, + "step": 36500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1179, + "step": 36600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1239, + "step": 36700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1209, + "step": 36800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1193, + "step": 36900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1213, + "step": 37000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1192, + "step": 37100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1263, + "step": 37200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1199, + "step": 37300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.123, + "step": 37400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1162, + "step": 37500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1231, + "step": 37600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1166, + "step": 37700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1236, + "step": 37800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1215, + "step": 37900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1256, + "step": 38000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1201, + "step": 38100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1221, + "step": 38200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.118, + "step": 38300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1166, + "step": 38400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1133, + "step": 38500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1276, + "step": 38600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1208, + "step": 38700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.113, + "step": 38800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1182, + "step": 38900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1155, + "step": 39000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1215, + "step": 39100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1215, + "step": 39200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1167, + "step": 39300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1275, + "step": 39400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.114, + "step": 39500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.12, + "step": 39600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1173, + "step": 39700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1327, + "step": 39800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1156, + "step": 39900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1122, + "step": 40000 + }, + { + "epoch": 0.0, + "eval_loss": 0.09625244140625, + "eval_runtime": 3128.3661, + "eval_samples_per_second": 359.524, + "eval_steps_per_second": 22.471, + "step": 40000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1227, + "step": 40100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.116, + "step": 40200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1083, + "step": 40300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1189, + "step": 40400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1126, + "step": 40500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1162, + "step": 40600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1152, + "step": 40700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1142, + "step": 40800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1118, + "step": 40900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1138, + "step": 41000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1139, + "step": 41100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1173, + "step": 41200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1101, + "step": 41300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1185, + "step": 41400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1135, + "step": 41500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1141, + "step": 41600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1159, + "step": 41700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1084, + "step": 41800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.113, + "step": 41900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1142, + "step": 42000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1116, + "step": 42100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1209, + "step": 42200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1188, + "step": 42300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1173, + "step": 42400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1121, + "step": 42500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1077, + "step": 42600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1088, + "step": 42700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1125, + "step": 42800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1067, + "step": 42900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.11, + "step": 43000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1124, + "step": 43100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1125, + "step": 43200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1093, + "step": 43300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.115, + "step": 43400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.111, + "step": 43500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1099, + "step": 43600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1125, + "step": 43700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.112, + "step": 43800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.104, + "step": 43900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1082, + "step": 44000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1118, + "step": 44100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1046, + "step": 44200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1124, + "step": 44300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1116, + "step": 44400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1079, + "step": 44500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1048, + "step": 44600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1104, + "step": 44700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1105, + "step": 44800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1136, + "step": 44900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1049, + "step": 45000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1099, + "step": 45100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1074, + "step": 45200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1102, + "step": 45300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1092, + "step": 45400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1077, + "step": 45500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1082, + "step": 45600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1012, + "step": 45700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1106, + "step": 45800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1107, + "step": 45900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1124, + "step": 46000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1043, + "step": 46100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1054, + "step": 46200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1041, + "step": 46300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1011, + "step": 46400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1116, + "step": 46500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1089, + "step": 46600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1078, + "step": 46700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1094, + "step": 46800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1072, + "step": 46900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.105, + "step": 47000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1075, + "step": 47100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1046, + "step": 47200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1093, + "step": 47300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1102, + "step": 47400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1052, + "step": 47500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1066, + "step": 47600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1008, + "step": 47700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1058, + "step": 47800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1065, + "step": 47900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1042, + "step": 48000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1064, + "step": 48100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1027, + "step": 48200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1051, + "step": 48300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.106, + "step": 48400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.102, + "step": 48500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1063, + "step": 48600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1024, + "step": 48700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1065, + "step": 48800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1083, + "step": 48900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1072, + "step": 49000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1064, + "step": 49100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1048, + "step": 49200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1008, + "step": 49300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.106, + "step": 49400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0995, + "step": 49500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1022, + "step": 49600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1019, + "step": 49700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1029, + "step": 49800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1014, + "step": 49900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1032, + "step": 50000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0982, + "step": 50100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1027, + "step": 50200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1084, + "step": 50300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1008, + "step": 50400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1026, + "step": 50500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1018, + "step": 50600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0991, + "step": 50700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1074, + "step": 50800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1076, + "step": 50900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1056, + "step": 51000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.099, + "step": 51100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1013, + "step": 51200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0939, + "step": 51300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1038, + "step": 51400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0967, + "step": 51500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1051, + "step": 51600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1023, + "step": 51700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1034, + "step": 51800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.103, + "step": 51900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1015, + "step": 52000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1018, + "step": 52100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0962, + "step": 52200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.103, + "step": 52300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0988, + "step": 52400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1014, + "step": 52500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0948, + "step": 52600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0998, + "step": 52700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1071, + "step": 52800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1007, + "step": 52900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1041, + "step": 53000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0985, + "step": 53100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1001, + "step": 53200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1007, + "step": 53300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1011, + "step": 53400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0966, + "step": 53500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1024, + "step": 53600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.102, + "step": 53700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1035, + "step": 53800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0976, + "step": 53900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1008, + "step": 54000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0953, + "step": 54100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0936, + "step": 54200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0994, + "step": 54300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0998, + "step": 54400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.097, + "step": 54500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1021, + "step": 54600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0985, + "step": 54700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1005, + "step": 54800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1027, + "step": 54900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.097, + "step": 55000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0953, + "step": 55100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0949, + "step": 55200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1013, + "step": 55300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0994, + "step": 55400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0999, + "step": 55500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0965, + "step": 55600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1009, + "step": 55700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0952, + "step": 55800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0986, + "step": 55900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0942, + "step": 56000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.102, + "step": 56100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1031, + "step": 56200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.099, + "step": 56300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0951, + "step": 56400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.097, + "step": 56500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1002, + "step": 56600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0953, + "step": 56700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1002, + "step": 56800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0936, + "step": 56900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0987, + "step": 57000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.093, + "step": 57100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0901, + "step": 57200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1005, + "step": 57300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0977, + "step": 57400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0982, + "step": 57500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0979, + "step": 57600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0974, + "step": 57700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0979, + "step": 57800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0922, + "step": 57900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0996, + "step": 58000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0926, + "step": 58100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.095, + "step": 58200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1011, + "step": 58300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0937, + "step": 58400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1016, + "step": 58500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1028, + "step": 58600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0952, + "step": 58700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0958, + "step": 58800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0976, + "step": 58900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0959, + "step": 59000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0931, + "step": 59100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0934, + "step": 59200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0995, + "step": 59300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0909, + "step": 59400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0941, + "step": 59500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.098, + "step": 59600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0958, + "step": 59700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0927, + "step": 59800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0954, + "step": 59900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0928, + "step": 60000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0804443359375, + "eval_runtime": 3083.0678, + "eval_samples_per_second": 364.806, + "eval_steps_per_second": 22.801, + "step": 60000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0972, + "step": 60100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0896, + "step": 60200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0929, + "step": 60300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0896, + "step": 60400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0945, + "step": 60500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1001, + "step": 60600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0927, + "step": 60700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0905, + "step": 60800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0933, + "step": 60900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0923, + "step": 61000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0931, + "step": 61100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0942, + "step": 61200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.092, + "step": 61300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0898, + "step": 61400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0899, + "step": 61500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0901, + "step": 61600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.095, + "step": 61700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0947, + "step": 61800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0957, + "step": 61900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0982, + "step": 62000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0922, + "step": 62100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0899, + "step": 62200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0953, + "step": 62300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0973, + "step": 62400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0908, + "step": 62500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0919, + "step": 62600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0958, + "step": 62700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0945, + "step": 62800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0951, + "step": 62900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0951, + "step": 63000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0874, + "step": 63100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0903, + "step": 63200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0898, + "step": 63300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0939, + "step": 63400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0931, + "step": 63500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0896, + "step": 63600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0945, + "step": 63700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0903, + "step": 63800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0869, + "step": 63900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0886, + "step": 64000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0927, + "step": 64100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0986, + "step": 64200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0952, + "step": 64300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0915, + "step": 64400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0871, + "step": 64500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0879, + "step": 64600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0926, + "step": 64700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0889, + "step": 64800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0892, + "step": 64900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0961, + "step": 65000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0918, + "step": 65100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0921, + "step": 65200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0894, + "step": 65300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.088, + "step": 65400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0918, + "step": 65500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0907, + "step": 65600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0932, + "step": 65700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0855, + "step": 65800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0895, + "step": 65900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0881, + "step": 66000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0916, + "step": 66100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0896, + "step": 66200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0916, + "step": 66300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0878, + "step": 66400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0891, + "step": 66500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0911, + "step": 66600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0932, + "step": 66700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0892, + "step": 66800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0886, + "step": 66900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0879, + "step": 67000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.093, + "step": 67100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0939, + "step": 67200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0993, + "step": 67300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0933, + "step": 67400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0907, + "step": 67500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0834, + "step": 67600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0865, + "step": 67700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0905, + "step": 67800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0839, + "step": 67900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0878, + "step": 68000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0879, + "step": 68100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.09, + "step": 68200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.086, + "step": 68300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0871, + "step": 68400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0855, + "step": 68500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0887, + "step": 68600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0844, + "step": 68700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0856, + "step": 68800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0848, + "step": 68900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0867, + "step": 69000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0859, + "step": 69100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0843, + "step": 69200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0939, + "step": 69300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0895, + "step": 69400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0911, + "step": 69500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0832, + "step": 69600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.085, + "step": 69700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.092, + "step": 69800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0909, + "step": 69900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0873, + "step": 70000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0859, + "step": 70100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0826, + "step": 70200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0832, + "step": 70300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.089, + "step": 70400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0834, + "step": 70500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0865, + "step": 70600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0873, + "step": 70700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0846, + "step": 70800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0857, + "step": 70900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0885, + "step": 71000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0854, + "step": 71100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0963, + "step": 71200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0833, + "step": 71300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0865, + "step": 71400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0865, + "step": 71500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0926, + "step": 71600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0859, + "step": 71700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0857, + "step": 71800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.087, + "step": 71900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0875, + "step": 72000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0846, + "step": 72100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.081, + "step": 72200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0831, + "step": 72300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0875, + "step": 72400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.082, + "step": 72500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0863, + "step": 72600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0893, + "step": 72700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0873, + "step": 72800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0873, + "step": 72900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0898, + "step": 73000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0837, + "step": 73100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0923, + "step": 73200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0813, + "step": 73300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0834, + "step": 73400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0839, + "step": 73500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0887, + "step": 73600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0818, + "step": 73700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0834, + "step": 73800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0868, + "step": 73900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0816, + "step": 74000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0802, + "step": 74100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0896, + "step": 74200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0874, + "step": 74300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0841, + "step": 74400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0832, + "step": 74500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0847, + "step": 74600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.086, + "step": 74700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0903, + "step": 74800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0813, + "step": 74900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0793, + "step": 75000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0889, + "step": 75100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0845, + "step": 75200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0922, + "step": 75300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0835, + "step": 75400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0795, + "step": 75500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.085, + "step": 75600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0769, + "step": 75700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0844, + "step": 75800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.09, + "step": 75900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0785, + "step": 76000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0865, + "step": 76100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0891, + "step": 76200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0843, + "step": 76300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0827, + "step": 76400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0851, + "step": 76500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0856, + "step": 76600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0924, + "step": 76700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0799, + "step": 76800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0833, + "step": 76900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0856, + "step": 77000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0845, + "step": 77100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0907, + "step": 77200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0842, + "step": 77300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0821, + "step": 77400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0864, + "step": 77500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0823, + "step": 77600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0844, + "step": 77700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0802, + "step": 77800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0901, + "step": 77900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0837, + "step": 78000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0862, + "step": 78100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0845, + "step": 78200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0805, + "step": 78300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0875, + "step": 78400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0827, + "step": 78500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0827, + "step": 78600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0837, + "step": 78700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0825, + "step": 78800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0836, + "step": 78900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0815, + "step": 79000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0813, + "step": 79100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0887, + "step": 79200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0817, + "step": 79300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0824, + "step": 79400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0819, + "step": 79500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.082, + "step": 79600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0815, + "step": 79700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.083, + "step": 79800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.088, + "step": 79900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0826, + "step": 80000 + }, + { + "epoch": 0.0, + "eval_loss": 0.069091796875, + "eval_runtime": 3095.1044, + "eval_samples_per_second": 363.388, + "eval_steps_per_second": 22.712, + "step": 80000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0816, + "step": 80100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0808, + "step": 80200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.09, + "step": 80300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0787, + "step": 80400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0847, + "step": 80500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0852, + "step": 80600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0827, + "step": 80700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0847, + "step": 80800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.085, + "step": 80900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0835, + "step": 81000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0761, + "step": 81100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0825, + "step": 81200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0841, + "step": 81300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0803, + "step": 81400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0798, + "step": 81500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0787, + "step": 81600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0836, + "step": 81700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.077, + "step": 81800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0832, + "step": 81900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.077, + "step": 82000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.082, + "step": 82100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0804, + "step": 82200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0877, + "step": 82300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0864, + "step": 82400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0841, + "step": 82500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0839, + "step": 82600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0797, + "step": 82700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0827, + "step": 82800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0785, + "step": 82900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0788, + "step": 83000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0807, + "step": 83100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0874, + "step": 83200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0813, + "step": 83300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0821, + "step": 83400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0834, + "step": 83500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0805, + "step": 83600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0746, + "step": 83700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0806, + "step": 83800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0775, + "step": 83900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0818, + "step": 84000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0843, + "step": 84100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0822, + "step": 84200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0827, + "step": 84300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0804, + "step": 84400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0835, + "step": 84500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0799, + "step": 84600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0835, + "step": 84700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0766, + "step": 84800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0834, + "step": 84900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0784, + "step": 85000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0824, + "step": 85100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0784, + "step": 85200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0796, + "step": 85300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0774, + "step": 85400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0797, + "step": 85500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.079, + "step": 85600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0777, + "step": 85700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0787, + "step": 85800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0748, + "step": 85900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0782, + "step": 86000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0798, + "step": 86100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0788, + "step": 86200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.08, + "step": 86300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0837, + "step": 86400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0786, + "step": 86500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0787, + "step": 86600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.074, + "step": 86700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0785, + "step": 86800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0818, + "step": 86900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0758, + "step": 87000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0798, + "step": 87100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0822, + "step": 87200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0754, + "step": 87300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0783, + "step": 87400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0802, + "step": 87500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0792, + "step": 87600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0797, + "step": 87700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0816, + "step": 87800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0795, + "step": 87900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0768, + "step": 88000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0864, + "step": 88100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0792, + "step": 88200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0767, + "step": 88300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0791, + "step": 88400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0812, + "step": 88500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0795, + "step": 88600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0769, + "step": 88700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.082, + "step": 88800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0797, + "step": 88900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0777, + "step": 89000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0746, + "step": 89100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.083, + "step": 89200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0827, + "step": 89300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0797, + "step": 89400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0752, + "step": 89500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0827, + "step": 89600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0781, + "step": 89700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0779, + "step": 89800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.079, + "step": 89900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.081, + "step": 90000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0789, + "step": 90100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.077, + "step": 90200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0745, + "step": 90300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0743, + "step": 90400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0816, + "step": 90500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0813, + "step": 90600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0808, + "step": 90700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0837, + "step": 90800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0783, + "step": 90900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0757, + "step": 91000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0778, + "step": 91100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0781, + "step": 91200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0786, + "step": 91300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0802, + "step": 91400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0781, + "step": 91500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0769, + "step": 91600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0809, + "step": 91700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0722, + "step": 91800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0815, + "step": 91900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0767, + "step": 92000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0756, + "step": 92100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0775, + "step": 92200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0781, + "step": 92300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0814, + "step": 92400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0752, + "step": 92500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0802, + "step": 92600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.074, + "step": 92700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.079, + "step": 92800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0826, + "step": 92900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0777, + "step": 93000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0731, + "step": 93100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0799, + "step": 93200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0806, + "step": 93300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.083, + "step": 93400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0773, + "step": 93500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0758, + "step": 93600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0765, + "step": 93700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0792, + "step": 93800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0814, + "step": 93900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0786, + "step": 94000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0769, + "step": 94100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.078, + "step": 94200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0796, + "step": 94300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.079, + "step": 94400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.078, + "step": 94500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0768, + "step": 94600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0736, + "step": 94700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0752, + "step": 94800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0788, + "step": 94900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.077, + "step": 95000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0716, + "step": 95100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0765, + "step": 95200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0753, + "step": 95300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.075, + "step": 95400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.073, + "step": 95500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0771, + "step": 95600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0826, + "step": 95700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0764, + "step": 95800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0772, + "step": 95900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0757, + "step": 96000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0737, + "step": 96100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0803, + "step": 96200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0733, + "step": 96300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0752, + "step": 96400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0802, + "step": 96500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0746, + "step": 96600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0759, + "step": 96700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0798, + "step": 96800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0701, + "step": 96900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0739, + "step": 97000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.077, + "step": 97100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0753, + "step": 97200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0764, + "step": 97300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0764, + "step": 97400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0727, + "step": 97500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0774, + "step": 97600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0771, + "step": 97700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0744, + "step": 97800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0752, + "step": 97900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0741, + "step": 98000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0771, + "step": 98100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0834, + "step": 98200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0753, + "step": 98300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0726, + "step": 98400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0759, + "step": 98500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0722, + "step": 98600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0754, + "step": 98700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0772, + "step": 98800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0734, + "step": 98900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0708, + "step": 99000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0771, + "step": 99100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0762, + "step": 99200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.075, + "step": 99300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0765, + "step": 99400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0737, + "step": 99500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0774, + "step": 99600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0758, + "step": 99700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0742, + "step": 99800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0729, + "step": 99900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0735, + "step": 100000 + }, + { + "epoch": 0.0, + "eval_loss": 0.062103271484375, + "eval_runtime": 3145.6437, + "eval_samples_per_second": 357.549, + "eval_steps_per_second": 22.347, + "step": 100000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0764, + "step": 100100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0765, + "step": 100200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0761, + "step": 100300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0737, + "step": 100400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0767, + "step": 100500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0708, + "step": 100600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0697, + "step": 100700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0779, + "step": 100800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0771, + "step": 100900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0784, + "step": 101000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0718, + "step": 101100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0723, + "step": 101200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0812, + "step": 101300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0767, + "step": 101400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0755, + "step": 101500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0732, + "step": 101600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0709, + "step": 101700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0804, + "step": 101800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0737, + "step": 101900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0744, + "step": 102000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0729, + "step": 102100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.074, + "step": 102200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0747, + "step": 102300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.074, + "step": 102400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0742, + "step": 102500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0695, + "step": 102600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0751, + "step": 102700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0744, + "step": 102800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0745, + "step": 102900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0703, + "step": 103000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0775, + "step": 103100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0719, + "step": 103200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0729, + "step": 103300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.072, + "step": 103400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0717, + "step": 103500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.074, + "step": 103600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0744, + "step": 103700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0676, + "step": 103800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0844, + "step": 103900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0718, + "step": 104000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0766, + "step": 104100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0723, + "step": 104200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0753, + "step": 104300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0799, + "step": 104400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0765, + "step": 104500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0662, + "step": 104600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0772, + "step": 104700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0682, + "step": 104800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0661, + "step": 104900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0729, + "step": 105000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0674, + "step": 105100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0764, + "step": 105200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0739, + "step": 105300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0732, + "step": 105400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0742, + "step": 105500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0722, + "step": 105600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0659, + "step": 105700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0722, + "step": 105800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.077, + "step": 105900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0724, + "step": 106000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0714, + "step": 106100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0719, + "step": 106200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0718, + "step": 106300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0721, + "step": 106400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0766, + "step": 106500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0693, + "step": 106600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0719, + "step": 106700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0719, + "step": 106800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0691, + "step": 106900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0768, + "step": 107000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0722, + "step": 107100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0737, + "step": 107200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0692, + "step": 107300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.076, + "step": 107400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0744, + "step": 107500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0786, + "step": 107600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0717, + "step": 107700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0685, + "step": 107800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0721, + "step": 107900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0719, + "step": 108000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0674, + "step": 108100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0729, + "step": 108200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0689, + "step": 108300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.065, + "step": 108400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0721, + "step": 108500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0714, + "step": 108600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0725, + "step": 108700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0756, + "step": 108800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0716, + "step": 108900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.079, + "step": 109000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0713, + "step": 109100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0713, + "step": 109200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0739, + "step": 109300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0737, + "step": 109400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0706, + "step": 109500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0715, + "step": 109600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0755, + "step": 109700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0757, + "step": 109800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0726, + "step": 109900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0708, + "step": 110000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0672, + "step": 110100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0732, + "step": 110200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0748, + "step": 110300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0746, + "step": 110400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0801, + "step": 110500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0683, + "step": 110600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0739, + "step": 110700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0705, + "step": 110800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0752, + "step": 110900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0685, + "step": 111000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0713, + "step": 111100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0698, + "step": 111200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0705, + "step": 111300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0735, + "step": 111400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0735, + "step": 111500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0737, + "step": 111600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.075, + "step": 111700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.068, + "step": 111800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.073, + "step": 111900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0706, + "step": 112000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0737, + "step": 112100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0711, + "step": 112200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0697, + "step": 112300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0721, + "step": 112400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0737, + "step": 112500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0697, + "step": 112600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0705, + "step": 112700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0701, + "step": 112800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0732, + "step": 112900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0737, + "step": 113000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.071, + "step": 113100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.07, + "step": 113200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0721, + "step": 113300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0738, + "step": 113400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0701, + "step": 113500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.072, + "step": 113600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0744, + "step": 113700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0705, + "step": 113800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0719, + "step": 113900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0642, + "step": 114000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0737, + "step": 114100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0759, + "step": 114200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0708, + "step": 114300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0672, + "step": 114400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0745, + "step": 114500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0682, + "step": 114600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.07, + "step": 114700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0678, + "step": 114800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0642, + "step": 114900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0747, + "step": 115000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0736, + "step": 115100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0705, + "step": 115200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0669, + "step": 115300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0662, + "step": 115400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0693, + "step": 115500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0727, + "step": 115600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0721, + "step": 115700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.073, + "step": 115800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0724, + "step": 115900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0681, + "step": 116000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0671, + "step": 116100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0689, + "step": 116200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0692, + "step": 116300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0712, + "step": 116400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0681, + "step": 116500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0712, + "step": 116600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0686, + "step": 116700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0666, + "step": 116800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0685, + "step": 116900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0684, + "step": 117000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0688, + "step": 117100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0708, + "step": 117200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0702, + "step": 117300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0727, + "step": 117400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0672, + "step": 117500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0681, + "step": 117600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0712, + "step": 117700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0743, + "step": 117800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0708, + "step": 117900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0689, + "step": 118000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0771, + "step": 118100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0703, + "step": 118200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.072, + "step": 118300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0676, + "step": 118400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0674, + "step": 118500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0679, + "step": 118600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.068, + "step": 118700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0675, + "step": 118800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0658, + "step": 118900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0699, + "step": 119000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0689, + "step": 119100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0751, + "step": 119200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0724, + "step": 119300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0687, + "step": 119400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.067, + "step": 119500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0722, + "step": 119600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0633, + "step": 119700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.067, + "step": 119800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0732, + "step": 119900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.067, + "step": 120000 + }, + { + "epoch": 0.0, + "eval_loss": 0.058624267578125, + "eval_runtime": 3109.0534, + "eval_samples_per_second": 361.757, + "eval_steps_per_second": 22.61, + "step": 120000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0644, + "step": 120100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0633, + "step": 120200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0626, + "step": 120300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0616, + "step": 120400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0595, + "step": 120500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0587, + "step": 120600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.059, + "step": 120700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0627, + "step": 120800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0616, + "step": 120900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0568, + "step": 121000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0607, + "step": 121100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0576, + "step": 121200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0588, + "step": 121300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.058, + "step": 121400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0616, + "step": 121500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0589, + "step": 121600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0625, + "step": 121700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0605, + "step": 121800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0609, + "step": 121900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0589, + "step": 122000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0617, + "step": 122100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.06, + "step": 122200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0622, + "step": 122300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0593, + "step": 122400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0598, + "step": 122500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0592, + "step": 122600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0578, + "step": 122700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0634, + "step": 122800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0628, + "step": 122900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0592, + "step": 123000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0601, + "step": 123100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0627, + "step": 123200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0582, + "step": 123300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0587, + "step": 123400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0569, + "step": 123500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0608, + "step": 123600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0589, + "step": 123700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0626, + "step": 123800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0598, + "step": 123900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0609, + "step": 124000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0618, + "step": 124100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0599, + "step": 124200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0597, + "step": 124300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0594, + "step": 124400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0583, + "step": 124500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0642, + "step": 124600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0586, + "step": 124700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0589, + "step": 124800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0593, + "step": 124900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.063, + "step": 125000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0605, + "step": 125100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.062, + "step": 125200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0587, + "step": 125300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0592, + "step": 125400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0584, + "step": 125500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0626, + "step": 125600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0643, + "step": 125700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0578, + "step": 125800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0586, + "step": 125900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.06, + "step": 126000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.059, + "step": 126100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.061, + "step": 126200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0618, + "step": 126300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0597, + "step": 126400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0608, + "step": 126500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0644, + "step": 126600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0562, + "step": 126700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0568, + "step": 126800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0581, + "step": 126900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0594, + "step": 127000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0568, + "step": 127100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.059, + "step": 127200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0612, + "step": 127300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0589, + "step": 127400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0594, + "step": 127500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0596, + "step": 127600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0596, + "step": 127700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.058, + "step": 127800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0602, + "step": 127900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0587, + "step": 128000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0595, + "step": 128100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0585, + "step": 128200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0568, + "step": 128300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0589, + "step": 128400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0589, + "step": 128500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0555, + "step": 128600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0595, + "step": 128700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0577, + "step": 128800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0597, + "step": 128900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0591, + "step": 129000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.056, + "step": 129100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0588, + "step": 129200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0585, + "step": 129300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0587, + "step": 129400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0588, + "step": 129500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0581, + "step": 129600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0588, + "step": 129700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0555, + "step": 129800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.058, + "step": 129900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0567, + "step": 130000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0579, + "step": 130100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0598, + "step": 130200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.06, + "step": 130300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.06, + "step": 130400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0566, + "step": 130500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0596, + "step": 130600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0579, + "step": 130700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0574, + "step": 130800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0571, + "step": 130900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0565, + "step": 131000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0578, + "step": 131100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0583, + "step": 131200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0563, + "step": 131300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0575, + "step": 131400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0558, + "step": 131500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0596, + "step": 131600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0576, + "step": 131700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0576, + "step": 131800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0562, + "step": 131900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0564, + "step": 132000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0578, + "step": 132100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0572, + "step": 132200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0572, + "step": 132300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0564, + "step": 132400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0604, + "step": 132500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0566, + "step": 132600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0564, + "step": 132700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0584, + "step": 132800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0573, + "step": 132900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0615, + "step": 133000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0573, + "step": 133100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0569, + "step": 133200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.057, + "step": 133300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0558, + "step": 133400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0565, + "step": 133500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0576, + "step": 133600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0581, + "step": 133700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0564, + "step": 133800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0602, + "step": 133900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0568, + "step": 134000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0566, + "step": 134100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0581, + "step": 134200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0562, + "step": 134300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0561, + "step": 134400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0566, + "step": 134500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0575, + "step": 134600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0567, + "step": 134700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0558, + "step": 134800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0552, + "step": 134900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0555, + "step": 135000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0584, + "step": 135100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0569, + "step": 135200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0559, + "step": 135300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0561, + "step": 135400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0542, + "step": 135500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0578, + "step": 135600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0561, + "step": 135700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0552, + "step": 135800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0563, + "step": 135900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0588, + "step": 136000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0561, + "step": 136100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0562, + "step": 136200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0585, + "step": 136300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0558, + "step": 136400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0552, + "step": 136500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0552, + "step": 136600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.058, + "step": 136700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0561, + "step": 136800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0561, + "step": 136900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0578, + "step": 137000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.054, + "step": 137100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0551, + "step": 137200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0571, + "step": 137300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0508, + "step": 137400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0547, + "step": 137500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0546, + "step": 137600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0573, + "step": 137700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0557, + "step": 137800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0604, + "step": 137900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0559, + "step": 138000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0557, + "step": 138100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0567, + "step": 138200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0585, + "step": 138300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0544, + "step": 138400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0579, + "step": 138500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0537, + "step": 138600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.058, + "step": 138700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0576, + "step": 138800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0554, + "step": 138900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.058, + "step": 139000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0525, + "step": 139100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0536, + "step": 139200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0538, + "step": 139300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0554, + "step": 139400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0545, + "step": 139500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0552, + "step": 139600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0557, + "step": 139700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0568, + "step": 139800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0544, + "step": 139900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0562, + "step": 140000 + }, + { + "epoch": 1.0, + "eval_loss": 0.04754638671875, + "eval_runtime": 3429.7025, + "eval_samples_per_second": 327.936, + "eval_steps_per_second": 20.496, + "step": 140000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0557, + "step": 140100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0569, + "step": 140200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.055, + "step": 140300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0549, + "step": 140400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0561, + "step": 140500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0575, + "step": 140600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0558, + "step": 140700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0537, + "step": 140800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0521, + "step": 140900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0557, + "step": 141000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.053, + "step": 141100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0544, + "step": 141200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0574, + "step": 141300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0547, + "step": 141400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0552, + "step": 141500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0559, + "step": 141600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0549, + "step": 141700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0547, + "step": 141800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0566, + "step": 141900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0525, + "step": 142000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0537, + "step": 142100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0537, + "step": 142200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0562, + "step": 142300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0553, + "step": 142400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0549, + "step": 142500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0561, + "step": 142600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0544, + "step": 142700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0557, + "step": 142800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.054, + "step": 142900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0533, + "step": 143000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0553, + "step": 143100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0569, + "step": 143200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0528, + "step": 143300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.056, + "step": 143400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0556, + "step": 143500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0544, + "step": 143600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0545, + "step": 143700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0559, + "step": 143800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0545, + "step": 143900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0542, + "step": 144000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0563, + "step": 144100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0542, + "step": 144200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0537, + "step": 144300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.056, + "step": 144400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0537, + "step": 144500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0532, + "step": 144600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0521, + "step": 144700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0548, + "step": 144800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0543, + "step": 144900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0554, + "step": 145000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0541, + "step": 145100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0534, + "step": 145200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0541, + "step": 145300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0525, + "step": 145400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0549, + "step": 145500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.056, + "step": 145600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.051, + "step": 145700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0563, + "step": 145800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0547, + "step": 145900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.055, + "step": 146000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0536, + "step": 146100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.054, + "step": 146200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0546, + "step": 146300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0543, + "step": 146400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0537, + "step": 146500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0551, + "step": 146600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.053, + "step": 146700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0542, + "step": 146800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0543, + "step": 146900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0574, + "step": 147000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0517, + "step": 147100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0531, + "step": 147200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0547, + "step": 147300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0552, + "step": 147400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0558, + "step": 147500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0565, + "step": 147600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0547, + "step": 147700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0533, + "step": 147800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0534, + "step": 147900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0511, + "step": 148000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0548, + "step": 148100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0545, + "step": 148200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0534, + "step": 148300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0524, + "step": 148400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0538, + "step": 148500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0536, + "step": 148600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0549, + "step": 148700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0547, + "step": 148800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0546, + "step": 148900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0576, + "step": 149000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0537, + "step": 149100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0521, + "step": 149200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0528, + "step": 149300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0535, + "step": 149400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0516, + "step": 149500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0564, + "step": 149600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0546, + "step": 149700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0538, + "step": 149800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0516, + "step": 149900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.054, + "step": 150000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0512, + "step": 150100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0523, + "step": 150200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0535, + "step": 150300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0522, + "step": 150400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0527, + "step": 150500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.055, + "step": 150600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0517, + "step": 150700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0536, + "step": 150800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0524, + "step": 150900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0548, + "step": 151000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0517, + "step": 151100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0501, + "step": 151200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0523, + "step": 151300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.051, + "step": 151400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0535, + "step": 151500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0519, + "step": 151600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0509, + "step": 151700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0541, + "step": 151800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0548, + "step": 151900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0533, + "step": 152000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0529, + "step": 152100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0552, + "step": 152200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0549, + "step": 152300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0546, + "step": 152400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0553, + "step": 152500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0533, + "step": 152600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0562, + "step": 152700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0514, + "step": 152800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0505, + "step": 152900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0549, + "step": 153000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0539, + "step": 153100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0534, + "step": 153200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0555, + "step": 153300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0531, + "step": 153400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0551, + "step": 153500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.052, + "step": 153600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0539, + "step": 153700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.05, + "step": 153800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0521, + "step": 153900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0519, + "step": 154000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0525, + "step": 154100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0525, + "step": 154200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0516, + "step": 154300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.054, + "step": 154400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0529, + "step": 154500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0538, + "step": 154600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0524, + "step": 154700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0528, + "step": 154800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0537, + "step": 154900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0535, + "step": 155000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0554, + "step": 155100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0524, + "step": 155200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0532, + "step": 155300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0528, + "step": 155400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0514, + "step": 155500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0512, + "step": 155600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0532, + "step": 155700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.052, + "step": 155800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.052, + "step": 155900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.053, + "step": 156000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0521, + "step": 156100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0522, + "step": 156200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0515, + "step": 156300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0521, + "step": 156400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0541, + "step": 156500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0528, + "step": 156600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0514, + "step": 156700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0536, + "step": 156800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0535, + "step": 156900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0535, + "step": 157000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0514, + "step": 157100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0525, + "step": 157200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0531, + "step": 157300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.052, + "step": 157400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0504, + "step": 157500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.052, + "step": 157600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0494, + "step": 157700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0533, + "step": 157800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0517, + "step": 157900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0516, + "step": 158000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0507, + "step": 158100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0534, + "step": 158200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0502, + "step": 158300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0532, + "step": 158400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0534, + "step": 158500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0512, + "step": 158600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0523, + "step": 158700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0533, + "step": 158800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0527, + "step": 158900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0535, + "step": 159000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.054, + "step": 159100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0515, + "step": 159200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0527, + "step": 159300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.05, + "step": 159400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0508, + "step": 159500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0538, + "step": 159600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0526, + "step": 159700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0517, + "step": 159800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0506, + "step": 159900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0517, + "step": 160000 + }, + { + "epoch": 1.0, + "eval_loss": 0.044464111328125, + "eval_runtime": 3391.5213, + "eval_samples_per_second": 331.628, + "eval_steps_per_second": 20.727, + "step": 160000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0509, + "step": 160100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0516, + "step": 160200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0543, + "step": 160300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.053, + "step": 160400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0511, + "step": 160500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.052, + "step": 160600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0544, + "step": 160700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0517, + "step": 160800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.05, + "step": 160900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0494, + "step": 161000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0542, + "step": 161100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0521, + "step": 161200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0537, + "step": 161300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0491, + "step": 161400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0525, + "step": 161500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0534, + "step": 161600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0544, + "step": 161700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0523, + "step": 161800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.053, + "step": 161900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.052, + "step": 162000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0516, + "step": 162100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0526, + "step": 162200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0549, + "step": 162300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0516, + "step": 162400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0508, + "step": 162500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0513, + "step": 162600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0514, + "step": 162700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0517, + "step": 162800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0504, + "step": 162900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0515, + "step": 163000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0529, + "step": 163100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0513, + "step": 163200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0483, + "step": 163300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0504, + "step": 163400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0518, + "step": 163500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0533, + "step": 163600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0511, + "step": 163700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0484, + "step": 163800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0505, + "step": 163900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0511, + "step": 164000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0502, + "step": 164100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0516, + "step": 164200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0503, + "step": 164300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0502, + "step": 164400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0504, + "step": 164500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0509, + "step": 164600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0505, + "step": 164700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0496, + "step": 164800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0497, + "step": 164900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0486, + "step": 165000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0529, + "step": 165100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.05, + "step": 165200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0515, + "step": 165300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0491, + "step": 165400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.05, + "step": 165500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0517, + "step": 165600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0487, + "step": 165700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0522, + "step": 165800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0519, + "step": 165900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0508, + "step": 166000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0493, + "step": 166100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0499, + "step": 166200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0516, + "step": 166300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0481, + "step": 166400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0527, + "step": 166500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0494, + "step": 166600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0506, + "step": 166700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0506, + "step": 166800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0527, + "step": 166900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0496, + "step": 167000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0508, + "step": 167100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0492, + "step": 167200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0529, + "step": 167300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0488, + "step": 167400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0488, + "step": 167500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0498, + "step": 167600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0498, + "step": 167700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0512, + "step": 167800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.052, + "step": 167900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0513, + "step": 168000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0517, + "step": 168100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0497, + "step": 168200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0478, + "step": 168300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0517, + "step": 168400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0534, + "step": 168500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0524, + "step": 168600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0502, + "step": 168700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0502, + "step": 168800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0505, + "step": 168900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.051, + "step": 169000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0504, + "step": 169100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.047, + "step": 169200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.053, + "step": 169300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0507, + "step": 169400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0527, + "step": 169500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0522, + "step": 169600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0491, + "step": 169700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0507, + "step": 169800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0506, + "step": 169900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0516, + "step": 170000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0498, + "step": 170100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0528, + "step": 170200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0542, + "step": 170300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0531, + "step": 170400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0493, + "step": 170500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0511, + "step": 170600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0532, + "step": 170700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0489, + "step": 170800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0501, + "step": 170900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0528, + "step": 171000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0491, + "step": 171100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0491, + "step": 171200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.052, + "step": 171300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0479, + "step": 171400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0482, + "step": 171500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0503, + "step": 171600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0485, + "step": 171700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0518, + "step": 171800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0475, + "step": 171900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0522, + "step": 172000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0511, + "step": 172100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0512, + "step": 172200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0521, + "step": 172300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0518, + "step": 172400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0503, + "step": 172500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0479, + "step": 172600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0504, + "step": 172700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0474, + "step": 172800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0498, + "step": 172900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0499, + "step": 173000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0503, + "step": 173100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0511, + "step": 173200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0508, + "step": 173300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0507, + "step": 173400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0519, + "step": 173500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0512, + "step": 173600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.049, + "step": 173700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.05, + "step": 173800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0503, + "step": 173900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0511, + "step": 174000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0505, + "step": 174100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0486, + "step": 174200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0498, + "step": 174300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0502, + "step": 174400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0512, + "step": 174500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0479, + "step": 174600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0503, + "step": 174700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0479, + "step": 174800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0486, + "step": 174900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0518, + "step": 175000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0531, + "step": 175100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0499, + "step": 175200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0495, + "step": 175300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0511, + "step": 175400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0477, + "step": 175500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0496, + "step": 175600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0496, + "step": 175700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0504, + "step": 175800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0491, + "step": 175900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0482, + "step": 176000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0512, + "step": 176100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0497, + "step": 176200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0508, + "step": 176300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0493, + "step": 176400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0488, + "step": 176500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0501, + "step": 176600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0493, + "step": 176700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0513, + "step": 176800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0525, + "step": 176900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0466, + "step": 177000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0507, + "step": 177100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0502, + "step": 177200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0499, + "step": 177300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0477, + "step": 177400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0515, + "step": 177500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0533, + "step": 177600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0505, + "step": 177700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0473, + "step": 177800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0484, + "step": 177900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0527, + "step": 178000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0472, + "step": 178100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0497, + "step": 178200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0502, + "step": 178300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0501, + "step": 178400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0473, + "step": 178500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0487, + "step": 178600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0503, + "step": 178700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0486, + "step": 178800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0524, + "step": 178900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0503, + "step": 179000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0485, + "step": 179100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0471, + "step": 179200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0493, + "step": 179300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0499, + "step": 179400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0472, + "step": 179500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0476, + "step": 179600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.05, + "step": 179700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0464, + "step": 179800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0501, + "step": 179900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0477, + "step": 180000 + }, + { + "epoch": 1.0, + "eval_loss": 0.0428466796875, + "eval_runtime": 3293.9688, + "eval_samples_per_second": 341.449, + "eval_steps_per_second": 21.341, + "step": 180000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.047, + "step": 180100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0521, + "step": 180200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0518, + "step": 180300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0471, + "step": 180400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0518, + "step": 180500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0492, + "step": 180600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0474, + "step": 180700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0496, + "step": 180800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0503, + "step": 180900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0499, + "step": 181000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.049, + "step": 181100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0475, + "step": 181200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0497, + "step": 181300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0479, + "step": 181400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0496, + "step": 181500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0508, + "step": 181600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0504, + "step": 181700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.05, + "step": 181800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0494, + "step": 181900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0473, + "step": 182000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0496, + "step": 182100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0479, + "step": 182200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0497, + "step": 182300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0474, + "step": 182400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0502, + "step": 182500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0509, + "step": 182600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0516, + "step": 182700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0507, + "step": 182800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0487, + "step": 182900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0499, + "step": 183000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0484, + "step": 183100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0491, + "step": 183200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0469, + "step": 183300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0499, + "step": 183400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0485, + "step": 183500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0493, + "step": 183600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0478, + "step": 183700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0496, + "step": 183800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0484, + "step": 183900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0473, + "step": 184000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0478, + "step": 184100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0508, + "step": 184200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0515, + "step": 184300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0494, + "step": 184400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0508, + "step": 184500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0476, + "step": 184600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0477, + "step": 184700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0463, + "step": 184800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0473, + "step": 184900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0495, + "step": 185000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0512, + "step": 185100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0515, + "step": 185200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0482, + "step": 185300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.05, + "step": 185400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0495, + "step": 185500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0499, + "step": 185600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0492, + "step": 185700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0489, + "step": 185800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0489, + "step": 185900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0492, + "step": 186000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0479, + "step": 186100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0476, + "step": 186200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0507, + "step": 186300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0503, + "step": 186400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0475, + "step": 186500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0469, + "step": 186600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.05, + "step": 186700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0499, + "step": 186800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0478, + "step": 186900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0489, + "step": 187000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0483, + "step": 187100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0486, + "step": 187200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0459, + "step": 187300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0479, + "step": 187400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0482, + "step": 187500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0495, + "step": 187600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0511, + "step": 187700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.049, + "step": 187800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0486, + "step": 187900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0508, + "step": 188000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0507, + "step": 188100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0482, + "step": 188200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0467, + "step": 188300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0469, + "step": 188400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0475, + "step": 188500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0463, + "step": 188600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0504, + "step": 188700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0465, + "step": 188800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.052, + "step": 188900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.052, + "step": 189000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0473, + "step": 189100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.048, + "step": 189200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0475, + "step": 189300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0484, + "step": 189400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0477, + "step": 189500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0513, + "step": 189600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.047, + "step": 189700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0453, + "step": 189800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0468, + "step": 189900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0489, + "step": 190000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0469, + "step": 190100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.048, + "step": 190200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0487, + "step": 190300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0486, + "step": 190400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0466, + "step": 190500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.049, + "step": 190600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0487, + "step": 190700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0482, + "step": 190800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0485, + "step": 190900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0498, + "step": 191000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.047, + "step": 191100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0486, + "step": 191200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0468, + "step": 191300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0484, + "step": 191400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0522, + "step": 191500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.049, + "step": 191600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0475, + "step": 191700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0472, + "step": 191800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0475, + "step": 191900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0485, + "step": 192000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0483, + "step": 192100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0479, + "step": 192200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0443, + "step": 192300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0477, + "step": 192400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0486, + "step": 192500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0483, + "step": 192600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0455, + "step": 192700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0469, + "step": 192800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.048, + "step": 192900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0492, + "step": 193000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0467, + "step": 193100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0474, + "step": 193200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0464, + "step": 193300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0481, + "step": 193400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0483, + "step": 193500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0476, + "step": 193600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0502, + "step": 193700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0456, + "step": 193800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.046, + "step": 193900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0473, + "step": 194000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0512, + "step": 194100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0462, + "step": 194200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.048, + "step": 194300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0463, + "step": 194400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0483, + "step": 194500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0499, + "step": 194600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0471, + "step": 194700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0488, + "step": 194800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0479, + "step": 194900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0471, + "step": 195000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0468, + "step": 195100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0495, + "step": 195200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.048, + "step": 195300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0521, + "step": 195400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0491, + "step": 195500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0488, + "step": 195600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0466, + "step": 195700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0472, + "step": 195800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0473, + "step": 195900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0493, + "step": 196000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0454, + "step": 196100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0481, + "step": 196200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0476, + "step": 196300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0475, + "step": 196400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0466, + "step": 196500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0507, + "step": 196600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0489, + "step": 196700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0473, + "step": 196800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0479, + "step": 196900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0485, + "step": 197000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0482, + "step": 197100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0477, + "step": 197200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.049, + "step": 197300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0474, + "step": 197400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0472, + "step": 197500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0489, + "step": 197600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0466, + "step": 197700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0492, + "step": 197800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0471, + "step": 197900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0506, + "step": 198000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0484, + "step": 198100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0471, + "step": 198200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.044, + "step": 198300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0477, + "step": 198400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.047, + "step": 198500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0461, + "step": 198600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.047, + "step": 198700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0464, + "step": 198800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0448, + "step": 198900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0464, + "step": 199000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0485, + "step": 199100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0492, + "step": 199200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0461, + "step": 199300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0499, + "step": 199400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0485, + "step": 199500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0463, + "step": 199600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0496, + "step": 199700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0459, + "step": 199800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.049, + "step": 199900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.045, + "step": 200000 + }, + { + "epoch": 1.0, + "eval_loss": 0.041534423828125, + "eval_runtime": 3291.5411, + "eval_samples_per_second": 341.701, + "eval_steps_per_second": 21.357, + "step": 200000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0464, + "step": 200100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0468, + "step": 200200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0469, + "step": 200300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0491, + "step": 200400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0461, + "step": 200500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0488, + "step": 200600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0476, + "step": 200700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0495, + "step": 200800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0457, + "step": 200900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.048, + "step": 201000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0481, + "step": 201100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0473, + "step": 201200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0477, + "step": 201300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.045, + "step": 201400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.046, + "step": 201500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0452, + "step": 201600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0485, + "step": 201700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.048, + "step": 201800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.045, + "step": 201900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0458, + "step": 202000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0462, + "step": 202100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0491, + "step": 202200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0462, + "step": 202300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0463, + "step": 202400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0452, + "step": 202500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0486, + "step": 202600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0463, + "step": 202700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0496, + "step": 202800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0487, + "step": 202900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.048, + "step": 203000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0475, + "step": 203100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0471, + "step": 203200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0439, + "step": 203300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0468, + "step": 203400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0465, + "step": 203500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0488, + "step": 203600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.047, + "step": 203700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0466, + "step": 203800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0467, + "step": 203900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0453, + "step": 204000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0456, + "step": 204100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0471, + "step": 204200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.047, + "step": 204300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0479, + "step": 204400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0454, + "step": 204500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.046, + "step": 204600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0465, + "step": 204700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0497, + "step": 204800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0478, + "step": 204900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0441, + "step": 205000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0488, + "step": 205100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0476, + "step": 205200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0477, + "step": 205300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0472, + "step": 205400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0474, + "step": 205500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0497, + "step": 205600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0453, + "step": 205700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0466, + "step": 205800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0496, + "step": 205900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0454, + "step": 206000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0472, + "step": 206100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0478, + "step": 206200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0462, + "step": 206300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.047, + "step": 206400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0481, + "step": 206500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0437, + "step": 206600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0491, + "step": 206700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0489, + "step": 206800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0478, + "step": 206900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0471, + "step": 207000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0457, + "step": 207100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0466, + "step": 207200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0479, + "step": 207300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0484, + "step": 207400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.047, + "step": 207500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0492, + "step": 207600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0462, + "step": 207700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0454, + "step": 207800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0477, + "step": 207900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.049, + "step": 208000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0501, + "step": 208100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0468, + "step": 208200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0458, + "step": 208300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0459, + "step": 208400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0494, + "step": 208500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0449, + "step": 208600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0469, + "step": 208700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0459, + "step": 208800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0468, + "step": 208900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0481, + "step": 209000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.046, + "step": 209100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0476, + "step": 209200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.048, + "step": 209300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.045, + "step": 209400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.045, + "step": 209500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0432, + "step": 209600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0475, + "step": 209700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.046, + "step": 209800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0463, + "step": 209900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0448, + "step": 210000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0477, + "step": 210100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0454, + "step": 210200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0479, + "step": 210300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0486, + "step": 210400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0459, + "step": 210500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0444, + "step": 210600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0461, + "step": 210700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0448, + "step": 210800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0483, + "step": 210900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0449, + "step": 211000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0464, + "step": 211100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0448, + "step": 211200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0445, + "step": 211300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0478, + "step": 211400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0471, + "step": 211500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.047, + "step": 211600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0457, + "step": 211700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0452, + "step": 211800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0448, + "step": 211900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0459, + "step": 212000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0468, + "step": 212100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0455, + "step": 212200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0462, + "step": 212300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0437, + "step": 212400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0458, + "step": 212500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0463, + "step": 212600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0446, + "step": 212700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0476, + "step": 212800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0468, + "step": 212900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0472, + "step": 213000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0467, + "step": 213100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0454, + "step": 213200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.046, + "step": 213300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.045, + "step": 213400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0452, + "step": 213500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0471, + "step": 213600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0483, + "step": 213700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0479, + "step": 213800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0476, + "step": 213900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0463, + "step": 214000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0444, + "step": 214100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0471, + "step": 214200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.048, + "step": 214300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0442, + "step": 214400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.047, + "step": 214500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0464, + "step": 214600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0471, + "step": 214700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0463, + "step": 214800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.046, + "step": 214900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0437, + "step": 215000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0421, + "step": 215100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0466, + "step": 215200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0475, + "step": 215300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.047, + "step": 215400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0472, + "step": 215500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0443, + "step": 215600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0461, + "step": 215700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0475, + "step": 215800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0446, + "step": 215900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.045, + "step": 216000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0445, + "step": 216100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0462, + "step": 216200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0446, + "step": 216300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0478, + "step": 216400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0462, + "step": 216500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0471, + "step": 216600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0469, + "step": 216700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0477, + "step": 216800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0436, + "step": 216900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0449, + "step": 217000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0461, + "step": 217100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0478, + "step": 217200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0479, + "step": 217300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0463, + "step": 217400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0468, + "step": 217500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0474, + "step": 217600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0478, + "step": 217700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0482, + "step": 217800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0479, + "step": 217900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.047, + "step": 218000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.047, + "step": 218100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0471, + "step": 218200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0461, + "step": 218300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0472, + "step": 218400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0458, + "step": 218500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0467, + "step": 218600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.045, + "step": 218700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.046, + "step": 218800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0458, + "step": 218900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0461, + "step": 219000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0461, + "step": 219100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.047, + "step": 219200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0461, + "step": 219300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0464, + "step": 219400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0463, + "step": 219500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0455, + "step": 219600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0506, + "step": 219700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0475, + "step": 219800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.045, + "step": 219900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0461, + "step": 220000 + }, + { + "epoch": 1.0, + "eval_loss": 0.03997802734375, + "eval_runtime": 3291.9413, + "eval_samples_per_second": 341.659, + "eval_steps_per_second": 21.354, + "step": 220000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0475, + "step": 220100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0435, + "step": 220200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0464, + "step": 220300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0468, + "step": 220400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0491, + "step": 220500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0433, + "step": 220600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0427, + "step": 220700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0472, + "step": 220800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0454, + "step": 220900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0467, + "step": 221000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0455, + "step": 221100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0452, + "step": 221200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0452, + "step": 221300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0444, + "step": 221400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0435, + "step": 221500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0481, + "step": 221600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0454, + "step": 221700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0458, + "step": 221800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0434, + "step": 221900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0458, + "step": 222000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0447, + "step": 222100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0428, + "step": 222200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0459, + "step": 222300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0446, + "step": 222400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0439, + "step": 222500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0448, + "step": 222600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0444, + "step": 222700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0436, + "step": 222800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0488, + "step": 222900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0447, + "step": 223000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0463, + "step": 223100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0473, + "step": 223200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.045, + "step": 223300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0461, + "step": 223400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0474, + "step": 223500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0438, + "step": 223600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0443, + "step": 223700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0467, + "step": 223800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0473, + "step": 223900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0457, + "step": 224000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0428, + "step": 224100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0477, + "step": 224200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.045, + "step": 224300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0483, + "step": 224400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0428, + "step": 224500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0498, + "step": 224600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0449, + "step": 224700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0423, + "step": 224800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0437, + "step": 224900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0462, + "step": 225000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0427, + "step": 225100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0504, + "step": 225200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0445, + "step": 225300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0443, + "step": 225400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0471, + "step": 225500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0476, + "step": 225600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0433, + "step": 225700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0466, + "step": 225800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.046, + "step": 225900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.044, + "step": 226000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0464, + "step": 226100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0426, + "step": 226200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0438, + "step": 226300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0477, + "step": 226400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0453, + "step": 226500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0466, + "step": 226600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0438, + "step": 226700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0465, + "step": 226800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0432, + "step": 226900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0454, + "step": 227000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0436, + "step": 227100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.043, + "step": 227200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0452, + "step": 227300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.043, + "step": 227400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0463, + "step": 227500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.049, + "step": 227600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0476, + "step": 227700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0456, + "step": 227800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0444, + "step": 227900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0448, + "step": 228000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.046, + "step": 228100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0434, + "step": 228200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0477, + "step": 228300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0456, + "step": 228400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0464, + "step": 228500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.045, + "step": 228600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0464, + "step": 228700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0448, + "step": 228800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0429, + "step": 228900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0449, + "step": 229000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0439, + "step": 229100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0486, + "step": 229200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0458, + "step": 229300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0455, + "step": 229400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0444, + "step": 229500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0433, + "step": 229600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.046, + "step": 229700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0455, + "step": 229800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.046, + "step": 229900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0437, + "step": 230000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.046, + "step": 230100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0452, + "step": 230200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0451, + "step": 230300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0446, + "step": 230400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0444, + "step": 230500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.046, + "step": 230600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0431, + "step": 230700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0489, + "step": 230800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.047, + "step": 230900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.045, + "step": 231000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0467, + "step": 231100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0443, + "step": 231200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0459, + "step": 231300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0468, + "step": 231400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0474, + "step": 231500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0465, + "step": 231600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.045, + "step": 231700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0446, + "step": 231800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0446, + "step": 231900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0433, + "step": 232000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0444, + "step": 232100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0441, + "step": 232200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0442, + "step": 232300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0445, + "step": 232400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0466, + "step": 232500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0443, + "step": 232600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0464, + "step": 232700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.045, + "step": 232800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0466, + "step": 232900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0492, + "step": 233000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0485, + "step": 233100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0463, + "step": 233200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0452, + "step": 233300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0446, + "step": 233400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0456, + "step": 233500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0439, + "step": 233600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0448, + "step": 233700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0426, + "step": 233800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0446, + "step": 233900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0435, + "step": 234000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0435, + "step": 234100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.043, + "step": 234200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0457, + "step": 234300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0483, + "step": 234400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0474, + "step": 234500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0444, + "step": 234600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0473, + "step": 234700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0451, + "step": 234800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0454, + "step": 234900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0448, + "step": 235000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0452, + "step": 235100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0437, + "step": 235200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0419, + "step": 235300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.045, + "step": 235400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0425, + "step": 235500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0446, + "step": 235600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0433, + "step": 235700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0476, + "step": 235800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.045, + "step": 235900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0458, + "step": 236000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0451, + "step": 236100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0459, + "step": 236200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0451, + "step": 236300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0441, + "step": 236400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0461, + "step": 236500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0438, + "step": 236600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0458, + "step": 236700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0436, + "step": 236800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0441, + "step": 236900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0445, + "step": 237000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0452, + "step": 237100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0477, + "step": 237200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0438, + "step": 237300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0438, + "step": 237400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0434, + "step": 237500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0445, + "step": 237600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0444, + "step": 237700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0445, + "step": 237800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0453, + "step": 237900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0446, + "step": 238000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0444, + "step": 238100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0456, + "step": 238200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.044, + "step": 238300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0411, + "step": 238400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.045, + "step": 238500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0444, + "step": 238600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0456, + "step": 238700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0437, + "step": 238800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0444, + "step": 238900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0433, + "step": 239000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0452, + "step": 239100 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0448, + "step": 239200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.045, + "step": 239300 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0464, + "step": 239400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0463, + "step": 239500 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0464, + "step": 239600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0446, + "step": 239700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0433, + "step": 239800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0435, + "step": 239900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001, + "loss": 0.0455, + "step": 240000 + }, + { + "epoch": 1.0, + "eval_loss": 0.0391845703125, + "eval_runtime": 3284.0325, + "eval_samples_per_second": 342.482, + "eval_steps_per_second": 21.405, + "step": 240000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0432, + "step": 240100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0441, + "step": 240200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0439, + "step": 240300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0468, + "step": 240400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0435, + "step": 240500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.044, + "step": 240600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0452, + "step": 240700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0428, + "step": 240800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0463, + "step": 240900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0443, + "step": 241000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0465, + "step": 241100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0444, + "step": 241200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0427, + "step": 241300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0454, + "step": 241400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0476, + "step": 241500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0453, + "step": 241600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0448, + "step": 241700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0468, + "step": 241800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0443, + "step": 241900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0465, + "step": 242000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0452, + "step": 242100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0434, + "step": 242200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0449, + "step": 242300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0458, + "step": 242400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0431, + "step": 242500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0447, + "step": 242600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0467, + "step": 242700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0465, + "step": 242800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0434, + "step": 242900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0457, + "step": 243000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0471, + "step": 243100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0454, + "step": 243200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0447, + "step": 243300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0457, + "step": 243400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0427, + "step": 243500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.044, + "step": 243600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0453, + "step": 243700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0424, + "step": 243800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0427, + "step": 243900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0435, + "step": 244000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0446, + "step": 244100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.046, + "step": 244200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0451, + "step": 244300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0422, + "step": 244400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.042, + "step": 244500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0434, + "step": 244600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0449, + "step": 244700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0473, + "step": 244800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0441, + "step": 244900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0446, + "step": 245000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0456, + "step": 245100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0434, + "step": 245200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0444, + "step": 245300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0469, + "step": 245400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0439, + "step": 245500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0449, + "step": 245600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0454, + "step": 245700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0436, + "step": 245800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0434, + "step": 245900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0454, + "step": 246000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0446, + "step": 246100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0451, + "step": 246200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0425, + "step": 246300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0443, + "step": 246400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0458, + "step": 246500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0444, + "step": 246600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0442, + "step": 246700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0434, + "step": 246800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0458, + "step": 246900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0445, + "step": 247000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0449, + "step": 247100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0452, + "step": 247200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0452, + "step": 247300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0445, + "step": 247400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0449, + "step": 247500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0439, + "step": 247600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0409, + "step": 247700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0438, + "step": 247800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0449, + "step": 247900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0427, + "step": 248000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.045, + "step": 248100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0402, + "step": 248200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.041, + "step": 248300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0435, + "step": 248400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0438, + "step": 248500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0427, + "step": 248600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0472, + "step": 248700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0434, + "step": 248800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0432, + "step": 248900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0425, + "step": 249000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0432, + "step": 249100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0417, + "step": 249200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0454, + "step": 249300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.047, + "step": 249400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0446, + "step": 249500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0436, + "step": 249600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.046, + "step": 249700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0434, + "step": 249800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0449, + "step": 249900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0438, + "step": 250000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0446, + "step": 250100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0464, + "step": 250200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0442, + "step": 250300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0467, + "step": 250400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0445, + "step": 250500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0416, + "step": 250600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0429, + "step": 250700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0418, + "step": 250800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0446, + "step": 250900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0452, + "step": 251000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0454, + "step": 251100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0441, + "step": 251200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0454, + "step": 251300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.046, + "step": 251400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0454, + "step": 251500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0443, + "step": 251600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0421, + "step": 251700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0435, + "step": 251800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0428, + "step": 251900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.044, + "step": 252000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0439, + "step": 252100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0445, + "step": 252200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0434, + "step": 252300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0428, + "step": 252400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0425, + "step": 252500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0454, + "step": 252600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.044, + "step": 252700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0442, + "step": 252800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0462, + "step": 252900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0419, + "step": 253000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0432, + "step": 253100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.043, + "step": 253200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0433, + "step": 253300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0454, + "step": 253400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0441, + "step": 253500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0449, + "step": 253600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0444, + "step": 253700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.046, + "step": 253800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0435, + "step": 253900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0429, + "step": 254000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0472, + "step": 254100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0451, + "step": 254200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0442, + "step": 254300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0423, + "step": 254400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.045, + "step": 254500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0469, + "step": 254600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0442, + "step": 254700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0446, + "step": 254800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0439, + "step": 254900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0427, + "step": 255000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0447, + "step": 255100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0438, + "step": 255200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.044, + "step": 255300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0424, + "step": 255400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0426, + "step": 255500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0431, + "step": 255600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0407, + "step": 255700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0432, + "step": 255800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0422, + "step": 255900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.044, + "step": 256000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0419, + "step": 256100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0408, + "step": 256200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0419, + "step": 256300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0433, + "step": 256400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0434, + "step": 256500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0427, + "step": 256600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0453, + "step": 256700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0436, + "step": 256800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.044, + "step": 256900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0434, + "step": 257000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.045, + "step": 257100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0415, + "step": 257200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0441, + "step": 257300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0422, + "step": 257400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0435, + "step": 257500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0422, + "step": 257600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0422, + "step": 257700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0439, + "step": 257800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0453, + "step": 257900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0437, + "step": 258000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0425, + "step": 258100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0428, + "step": 258200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0415, + "step": 258300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0454, + "step": 258400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.044, + "step": 258500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0426, + "step": 258600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0418, + "step": 258700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0456, + "step": 258800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0435, + "step": 258900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0419, + "step": 259000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0436, + "step": 259100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0422, + "step": 259200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0429, + "step": 259300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.042, + "step": 259400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0441, + "step": 259500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0446, + "step": 259600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0448, + "step": 259700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.041, + "step": 259800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0431, + "step": 259900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0417, + "step": 260000 + }, + { + "epoch": 0.0, + "eval_loss": 0.03851318359375, + "eval_runtime": 3581.0286, + "eval_samples_per_second": 314.078, + "eval_steps_per_second": 19.63, + "step": 260000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0439, + "step": 260100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0432, + "step": 260200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0449, + "step": 260300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0428, + "step": 260400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0423, + "step": 260500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0442, + "step": 260600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.043, + "step": 260700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.043, + "step": 260800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0428, + "step": 260900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0428, + "step": 261000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0437, + "step": 261100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0425, + "step": 261200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0458, + "step": 261300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0455, + "step": 261400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0424, + "step": 261500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0451, + "step": 261600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0438, + "step": 261700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0467, + "step": 261800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0454, + "step": 261900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0435, + "step": 262000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0446, + "step": 262100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0474, + "step": 262200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0422, + "step": 262300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0423, + "step": 262400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0422, + "step": 262500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.043, + "step": 262600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0411, + "step": 262700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0404, + "step": 262800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0442, + "step": 262900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0444, + "step": 263000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.041, + "step": 263100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0435, + "step": 263200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0435, + "step": 263300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0411, + "step": 263400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0438, + "step": 263500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0432, + "step": 263600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0433, + "step": 263700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0429, + "step": 263800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0415, + "step": 263900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0451, + "step": 264000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.043, + "step": 264100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0415, + "step": 264200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0425, + "step": 264300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0426, + "step": 264400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0439, + "step": 264500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0446, + "step": 264600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0439, + "step": 264700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0443, + "step": 264800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0428, + "step": 264900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0452, + "step": 265000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0399, + "step": 265100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0438, + "step": 265200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0416, + "step": 265300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0404, + "step": 265400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0428, + "step": 265500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0426, + "step": 265600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0423, + "step": 265700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0448, + "step": 265800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0434, + "step": 265900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0458, + "step": 266000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0444, + "step": 266100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0425, + "step": 266200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.043, + "step": 266300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0432, + "step": 266400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.043, + "step": 266500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0418, + "step": 266600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0429, + "step": 266700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0441, + "step": 266800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0426, + "step": 266900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0438, + "step": 267000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0398, + "step": 267100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0429, + "step": 267200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0444, + "step": 267300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.043, + "step": 267400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0421, + "step": 267500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0429, + "step": 267600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0416, + "step": 267700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0422, + "step": 267800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0436, + "step": 267900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0436, + "step": 268000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0436, + "step": 268100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0418, + "step": 268200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0419, + "step": 268300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0432, + "step": 268400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0426, + "step": 268500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0454, + "step": 268600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0447, + "step": 268700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0433, + "step": 268800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0424, + "step": 268900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0409, + "step": 269000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0407, + "step": 269100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0427, + "step": 269200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0436, + "step": 269300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0423, + "step": 269400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0447, + "step": 269500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0435, + "step": 269600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0422, + "step": 269700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0449, + "step": 269800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0434, + "step": 269900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0433, + "step": 270000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0432, + "step": 270100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0436, + "step": 270200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0442, + "step": 270300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0428, + "step": 270400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0417, + "step": 270500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0422, + "step": 270600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.042, + "step": 270700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0424, + "step": 270800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.043, + "step": 270900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0428, + "step": 271000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.043, + "step": 271100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0412, + "step": 271200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0447, + "step": 271300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0433, + "step": 271400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0429, + "step": 271500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0439, + "step": 271600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0443, + "step": 271700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0409, + "step": 271800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0438, + "step": 271900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0429, + "step": 272000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0421, + "step": 272100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0406, + "step": 272200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0425, + "step": 272300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0406, + "step": 272400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0432, + "step": 272500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0426, + "step": 272600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0418, + "step": 272700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0419, + "step": 272800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0438, + "step": 272900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0425, + "step": 273000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0432, + "step": 273100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0422, + "step": 273200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0439, + "step": 273300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0445, + "step": 273400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.042, + "step": 273500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0408, + "step": 273600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0428, + "step": 273700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0433, + "step": 273800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0414, + "step": 273900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0441, + "step": 274000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0427, + "step": 274100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.043, + "step": 274200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0419, + "step": 274300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.042, + "step": 274400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0431, + "step": 274500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0403, + "step": 274600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0444, + "step": 274700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0449, + "step": 274800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.043, + "step": 274900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0445, + "step": 275000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0432, + "step": 275100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0445, + "step": 275200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0416, + "step": 275300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0427, + "step": 275400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0435, + "step": 275500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.046, + "step": 275600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0453, + "step": 275700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0435, + "step": 275800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0435, + "step": 275900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0425, + "step": 276000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0437, + "step": 276100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0413, + "step": 276200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0405, + "step": 276300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0445, + "step": 276400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0442, + "step": 276500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0425, + "step": 276600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.044, + "step": 276700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0393, + "step": 276800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0444, + "step": 276900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0444, + "step": 277000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0401, + "step": 277100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.042, + "step": 277200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0431, + "step": 277300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0419, + "step": 277400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.043, + "step": 277500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0428, + "step": 277600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.041, + "step": 277700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0416, + "step": 277800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0446, + "step": 277900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0421, + "step": 278000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0433, + "step": 278100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.042, + "step": 278200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0424, + "step": 278300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0418, + "step": 278400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0424, + "step": 278500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0417, + "step": 278600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0413, + "step": 278700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0433, + "step": 278800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0435, + "step": 278900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0415, + "step": 279000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0445, + "step": 279100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.043, + "step": 279200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0432, + "step": 279300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0433, + "step": 279400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0434, + "step": 279500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0428, + "step": 279600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0419, + "step": 279700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0455, + "step": 279800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0391, + "step": 279900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0433, + "step": 280000 + }, + { + "epoch": 0.0, + "eval_loss": 0.036956787109375, + "eval_runtime": 3481.1002, + "eval_samples_per_second": 323.094, + "eval_steps_per_second": 20.194, + "step": 280000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0425, + "step": 280100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0409, + "step": 280200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.043, + "step": 280300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0402, + "step": 280400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0417, + "step": 280500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0438, + "step": 280600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0415, + "step": 280700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0432, + "step": 280800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.043, + "step": 280900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0426, + "step": 281000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0396, + "step": 281100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0423, + "step": 281200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.043, + "step": 281300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0428, + "step": 281400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.041, + "step": 281500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0436, + "step": 281600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0436, + "step": 281700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0426, + "step": 281800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0429, + "step": 281900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0414, + "step": 282000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0432, + "step": 282100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.044, + "step": 282200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.042, + "step": 282300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0427, + "step": 282400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0423, + "step": 282500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0432, + "step": 282600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0421, + "step": 282700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0422, + "step": 282800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0457, + "step": 282900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0435, + "step": 283000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0417, + "step": 283100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.042, + "step": 283200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0434, + "step": 283300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0423, + "step": 283400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0413, + "step": 283500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0429, + "step": 283600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.043, + "step": 283700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0408, + "step": 283800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0426, + "step": 283900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0406, + "step": 284000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.042, + "step": 284100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.045, + "step": 284200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0418, + "step": 284300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0423, + "step": 284400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0419, + "step": 284500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0424, + "step": 284600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0412, + "step": 284700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0415, + "step": 284800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0419, + "step": 284900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0416, + "step": 285000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0446, + "step": 285100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0442, + "step": 285200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0445, + "step": 285300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0427, + "step": 285400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0436, + "step": 285500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0411, + "step": 285600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0432, + "step": 285700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0421, + "step": 285800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.043, + "step": 285900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.041, + "step": 286000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0424, + "step": 286100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.043, + "step": 286200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0422, + "step": 286300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0425, + "step": 286400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0424, + "step": 286500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0408, + "step": 286600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0428, + "step": 286700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0428, + "step": 286800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0416, + "step": 286900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0415, + "step": 287000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0419, + "step": 287100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0426, + "step": 287200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0438, + "step": 287300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0414, + "step": 287400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.044, + "step": 287500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0426, + "step": 287600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.041, + "step": 287700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0418, + "step": 287800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.041, + "step": 287900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0413, + "step": 288000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0415, + "step": 288100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0437, + "step": 288200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0415, + "step": 288300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0428, + "step": 288400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0435, + "step": 288500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0443, + "step": 288600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0413, + "step": 288700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0453, + "step": 288800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0422, + "step": 288900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0454, + "step": 289000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0467, + "step": 289100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0423, + "step": 289200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0421, + "step": 289300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0445, + "step": 289400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0434, + "step": 289500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0424, + "step": 289600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0419, + "step": 289700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.041, + "step": 289800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0411, + "step": 289900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0404, + "step": 290000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0406, + "step": 290100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0435, + "step": 290200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0403, + "step": 290300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0431, + "step": 290400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0432, + "step": 290500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0422, + "step": 290600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0448, + "step": 290700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0444, + "step": 290800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.042, + "step": 290900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0434, + "step": 291000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0402, + "step": 291100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0423, + "step": 291200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0396, + "step": 291300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0425, + "step": 291400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0395, + "step": 291500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0427, + "step": 291600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0439, + "step": 291700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0432, + "step": 291800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0435, + "step": 291900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0437, + "step": 292000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0416, + "step": 292100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0413, + "step": 292200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0423, + "step": 292300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0425, + "step": 292400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0415, + "step": 292500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.043, + "step": 292600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0423, + "step": 292700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0427, + "step": 292800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.044, + "step": 292900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0402, + "step": 293000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0434, + "step": 293100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0409, + "step": 293200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0415, + "step": 293300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0426, + "step": 293400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0411, + "step": 293500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0421, + "step": 293600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0422, + "step": 293700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.042, + "step": 293800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0423, + "step": 293900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.043, + "step": 294000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0436, + "step": 294100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0423, + "step": 294200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0409, + "step": 294300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0433, + "step": 294400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0444, + "step": 294500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0423, + "step": 294600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.041, + "step": 294700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0435, + "step": 294800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0414, + "step": 294900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0412, + "step": 295000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.042, + "step": 295100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0407, + "step": 295200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0414, + "step": 295300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0412, + "step": 295400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0431, + "step": 295500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.043, + "step": 295600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.042, + "step": 295700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0425, + "step": 295800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0416, + "step": 295900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0431, + "step": 296000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0404, + "step": 296100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0407, + "step": 296200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0418, + "step": 296300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0409, + "step": 296400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0418, + "step": 296500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0416, + "step": 296600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0409, + "step": 296700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0419, + "step": 296800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0447, + "step": 296900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.043, + "step": 297000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.043, + "step": 297100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0428, + "step": 297200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0422, + "step": 297300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0411, + "step": 297400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0425, + "step": 297500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0391, + "step": 297600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.043, + "step": 297700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0421, + "step": 297800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0425, + "step": 297900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0427, + "step": 298000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0415, + "step": 298100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0426, + "step": 298200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0426, + "step": 298300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0437, + "step": 298400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0404, + "step": 298500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0417, + "step": 298600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.043, + "step": 298700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0435, + "step": 298800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0413, + "step": 298900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0437, + "step": 299000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0406, + "step": 299100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0404, + "step": 299200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0416, + "step": 299300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0408, + "step": 299400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0421, + "step": 299500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0417, + "step": 299600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.043, + "step": 299700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0421, + "step": 299800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0401, + "step": 299900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0413, + "step": 300000 + }, + { + "epoch": 0.0, + "eval_loss": 0.037200927734375, + "eval_runtime": 3479.7507, + "eval_samples_per_second": 323.219, + "eval_steps_per_second": 20.201, + "step": 300000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.041, + "step": 300100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.04, + "step": 300200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0415, + "step": 300300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0434, + "step": 300400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.044, + "step": 300500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0417, + "step": 300600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0441, + "step": 300700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0434, + "step": 300800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0403, + "step": 300900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0416, + "step": 301000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0432, + "step": 301100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0403, + "step": 301200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0416, + "step": 301300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.043, + "step": 301400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0422, + "step": 301500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0419, + "step": 301600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0411, + "step": 301700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0441, + "step": 301800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0424, + "step": 301900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0402, + "step": 302000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0411, + "step": 302100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0433, + "step": 302200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0418, + "step": 302300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0414, + "step": 302400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.041, + "step": 302500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0417, + "step": 302600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.044, + "step": 302700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0421, + "step": 302800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0436, + "step": 302900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0404, + "step": 303000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0421, + "step": 303100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0405, + "step": 303200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0407, + "step": 303300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.04, + "step": 303400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.042, + "step": 303500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.043, + "step": 303600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.043, + "step": 303700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0424, + "step": 303800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0427, + "step": 303900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0437, + "step": 304000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0406, + "step": 304100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0438, + "step": 304200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0432, + "step": 304300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0399, + "step": 304400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0433, + "step": 304500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.043, + "step": 304600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0424, + "step": 304700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0421, + "step": 304800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.043, + "step": 304900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.044, + "step": 305000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0402, + "step": 305100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0404, + "step": 305200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0425, + "step": 305300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0426, + "step": 305400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0408, + "step": 305500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0394, + "step": 305600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0414, + "step": 305700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0431, + "step": 305800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.041, + "step": 305900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0403, + "step": 306000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0439, + "step": 306100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0411, + "step": 306200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0444, + "step": 306300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0417, + "step": 306400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0427, + "step": 306500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0405, + "step": 306600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0414, + "step": 306700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0416, + "step": 306800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0394, + "step": 306900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0428, + "step": 307000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0393, + "step": 307100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0418, + "step": 307200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0424, + "step": 307300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.042, + "step": 307400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0423, + "step": 307500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0416, + "step": 307600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0429, + "step": 307700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0411, + "step": 307800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0411, + "step": 307900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0409, + "step": 308000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0405, + "step": 308100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0429, + "step": 308200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0403, + "step": 308300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0424, + "step": 308400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0422, + "step": 308500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0416, + "step": 308600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0426, + "step": 308700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0406, + "step": 308800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0408, + "step": 308900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0427, + "step": 309000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0417, + "step": 309100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.042, + "step": 309200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0421, + "step": 309300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.042, + "step": 309400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0405, + "step": 309500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0436, + "step": 309600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0441, + "step": 309700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0418, + "step": 309800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0428, + "step": 309900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.042, + "step": 310000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.042, + "step": 310100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0422, + "step": 310200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0404, + "step": 310300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0418, + "step": 310400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0418, + "step": 310500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0418, + "step": 310600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0416, + "step": 310700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0422, + "step": 310800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0443, + "step": 310900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0422, + "step": 311000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0395, + "step": 311100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.041, + "step": 311200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0405, + "step": 311300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0413, + "step": 311400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0419, + "step": 311500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.042, + "step": 311600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0417, + "step": 311700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0413, + "step": 311800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0414, + "step": 311900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.042, + "step": 312000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.042, + "step": 312100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0408, + "step": 312200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0444, + "step": 312300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0441, + "step": 312400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0405, + "step": 312500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0414, + "step": 312600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0421, + "step": 312700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.041, + "step": 312800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0409, + "step": 312900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0411, + "step": 313000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0418, + "step": 313100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0437, + "step": 313200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0401, + "step": 313300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0421, + "step": 313400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0412, + "step": 313500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.043, + "step": 313600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0414, + "step": 313700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0404, + "step": 313800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0405, + "step": 313900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0412, + "step": 314000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0432, + "step": 314100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0399, + "step": 314200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0432, + "step": 314300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.041, + "step": 314400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0407, + "step": 314500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0411, + "step": 314600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.04, + "step": 314700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0419, + "step": 314800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0437, + "step": 314900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0393, + "step": 315000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0418, + "step": 315100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0408, + "step": 315200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0399, + "step": 315300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0401, + "step": 315400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0421, + "step": 315500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0419, + "step": 315600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0429, + "step": 315700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0411, + "step": 315800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.041, + "step": 315900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0397, + "step": 316000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.04, + "step": 316100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0419, + "step": 316200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0422, + "step": 316300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0416, + "step": 316400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0428, + "step": 316500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0426, + "step": 316600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0419, + "step": 316700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0425, + "step": 316800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0406, + "step": 316900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0424, + "step": 317000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0401, + "step": 317100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0406, + "step": 317200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0435, + "step": 317300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0416, + "step": 317400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0403, + "step": 317500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.041, + "step": 317600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0414, + "step": 317700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0429, + "step": 317800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0409, + "step": 317900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0426, + "step": 318000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0398, + "step": 318100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0428, + "step": 318200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0424, + "step": 318300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0418, + "step": 318400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0437, + "step": 318500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0399, + "step": 318600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0408, + "step": 318700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0389, + "step": 318800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0419, + "step": 318900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0425, + "step": 319000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0448, + "step": 319100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0419, + "step": 319200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0426, + "step": 319300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0434, + "step": 319400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0422, + "step": 319500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.038, + "step": 319600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.042, + "step": 319700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0432, + "step": 319800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.039, + "step": 319900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0414, + "step": 320000 + }, + { + "epoch": 0.0, + "eval_loss": 0.036041259765625, + "eval_runtime": 3349.372, + "eval_samples_per_second": 335.801, + "eval_steps_per_second": 20.988, + "step": 320000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0409, + "step": 320100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0408, + "step": 320200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0436, + "step": 320300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0397, + "step": 320400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0408, + "step": 320500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0392, + "step": 320600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.04, + "step": 320700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0411, + "step": 320800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0415, + "step": 320900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0426, + "step": 321000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0401, + "step": 321100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0415, + "step": 321200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0415, + "step": 321300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0406, + "step": 321400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0412, + "step": 321500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0396, + "step": 321600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0411, + "step": 321700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0436, + "step": 321800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0425, + "step": 321900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0434, + "step": 322000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0407, + "step": 322100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0407, + "step": 322200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0397, + "step": 322300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0415, + "step": 322400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0401, + "step": 322500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0411, + "step": 322600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0402, + "step": 322700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0403, + "step": 322800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.042, + "step": 322900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0408, + "step": 323000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0409, + "step": 323100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0418, + "step": 323200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0414, + "step": 323300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.038, + "step": 323400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0417, + "step": 323500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0411, + "step": 323600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.041, + "step": 323700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0414, + "step": 323800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0385, + "step": 323900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0399, + "step": 324000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0405, + "step": 324100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0422, + "step": 324200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0404, + "step": 324300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.04, + "step": 324400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0395, + "step": 324500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0428, + "step": 324600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0415, + "step": 324700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0402, + "step": 324800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0414, + "step": 324900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0408, + "step": 325000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0426, + "step": 325100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0437, + "step": 325200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0413, + "step": 325300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0421, + "step": 325400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.04, + "step": 325500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.04, + "step": 325600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0385, + "step": 325700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0417, + "step": 325800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0414, + "step": 325900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0407, + "step": 326000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0416, + "step": 326100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0416, + "step": 326200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0416, + "step": 326300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0405, + "step": 326400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0404, + "step": 326500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0429, + "step": 326600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0417, + "step": 326700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0397, + "step": 326800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0423, + "step": 326900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0431, + "step": 327000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0397, + "step": 327100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0402, + "step": 327200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0407, + "step": 327300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0389, + "step": 327400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0395, + "step": 327500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0398, + "step": 327600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0407, + "step": 327700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0412, + "step": 327800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.04, + "step": 327900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0409, + "step": 328000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.041, + "step": 328100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0418, + "step": 328200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0407, + "step": 328300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0415, + "step": 328400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0398, + "step": 328500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0401, + "step": 328600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0403, + "step": 328700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.04, + "step": 328800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0412, + "step": 328900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0403, + "step": 329000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0419, + "step": 329100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0414, + "step": 329200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0422, + "step": 329300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0386, + "step": 329400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0396, + "step": 329500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0428, + "step": 329600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0411, + "step": 329700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0397, + "step": 329800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0438, + "step": 329900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0397, + "step": 330000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0411, + "step": 330100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0434, + "step": 330200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0401, + "step": 330300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0405, + "step": 330400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0419, + "step": 330500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0422, + "step": 330600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0406, + "step": 330700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0411, + "step": 330800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0413, + "step": 330900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0409, + "step": 331000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0416, + "step": 331100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0427, + "step": 331200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0416, + "step": 331300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0394, + "step": 331400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0396, + "step": 331500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.039, + "step": 331600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0419, + "step": 331700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0413, + "step": 331800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0402, + "step": 331900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0422, + "step": 332000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0405, + "step": 332100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0417, + "step": 332200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0408, + "step": 332300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0407, + "step": 332400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0415, + "step": 332500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.041, + "step": 332600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0419, + "step": 332700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0415, + "step": 332800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.041, + "step": 332900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0403, + "step": 333000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0414, + "step": 333100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0402, + "step": 333200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0412, + "step": 333300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0417, + "step": 333400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0392, + "step": 333500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0424, + "step": 333600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0412, + "step": 333700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0404, + "step": 333800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0402, + "step": 333900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0396, + "step": 334000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0407, + "step": 334100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0419, + "step": 334200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0402, + "step": 334300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0403, + "step": 334400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0388, + "step": 334500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0403, + "step": 334600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0401, + "step": 334700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0419, + "step": 334800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0383, + "step": 334900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0404, + "step": 335000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0417, + "step": 335100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0402, + "step": 335200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0414, + "step": 335300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0412, + "step": 335400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0403, + "step": 335500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0412, + "step": 335600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0394, + "step": 335700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0411, + "step": 335800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0398, + "step": 335900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0421, + "step": 336000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0403, + "step": 336100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0406, + "step": 336200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0417, + "step": 336300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0384, + "step": 336400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.041, + "step": 336500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0383, + "step": 336600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0388, + "step": 336700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0415, + "step": 336800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0408, + "step": 336900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0388, + "step": 337000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0398, + "step": 337100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0377, + "step": 337200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0395, + "step": 337300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0385, + "step": 337400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0421, + "step": 337500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0399, + "step": 337600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0425, + "step": 337700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0412, + "step": 337800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0423, + "step": 337900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0414, + "step": 338000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0406, + "step": 338100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0414, + "step": 338200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0399, + "step": 338300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0386, + "step": 338400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0373, + "step": 338500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0419, + "step": 338600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0424, + "step": 338700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0417, + "step": 338800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0429, + "step": 338900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0397, + "step": 339000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0398, + "step": 339100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0411, + "step": 339200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0404, + "step": 339300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0412, + "step": 339400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0395, + "step": 339500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0407, + "step": 339600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0403, + "step": 339700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0405, + "step": 339800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0403, + "step": 339900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0398, + "step": 340000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0361328125, + "eval_runtime": 3382.3623, + "eval_samples_per_second": 332.526, + "eval_steps_per_second": 20.783, + "step": 340000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0412, + "step": 340100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0412, + "step": 340200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0423, + "step": 340300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.041, + "step": 340400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.041, + "step": 340500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0401, + "step": 340600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0406, + "step": 340700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0397, + "step": 340800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0403, + "step": 340900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0396, + "step": 341000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0422, + "step": 341100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0398, + "step": 341200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0389, + "step": 341300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0402, + "step": 341400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0412, + "step": 341500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0411, + "step": 341600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0404, + "step": 341700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0387, + "step": 341800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0403, + "step": 341900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0391, + "step": 342000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0406, + "step": 342100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0411, + "step": 342200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.039, + "step": 342300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0404, + "step": 342400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0425, + "step": 342500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.041, + "step": 342600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0398, + "step": 342700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0413, + "step": 342800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0424, + "step": 342900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0408, + "step": 343000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0413, + "step": 343100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0408, + "step": 343200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0393, + "step": 343300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0414, + "step": 343400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0424, + "step": 343500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0414, + "step": 343600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0399, + "step": 343700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0397, + "step": 343800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.04, + "step": 343900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0408, + "step": 344000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0388, + "step": 344100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.041, + "step": 344200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0408, + "step": 344300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0411, + "step": 344400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0397, + "step": 344500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0408, + "step": 344600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.042, + "step": 344700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0403, + "step": 344800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.041, + "step": 344900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0412, + "step": 345000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0396, + "step": 345100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0409, + "step": 345200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.041, + "step": 345300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0422, + "step": 345400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0385, + "step": 345500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0391, + "step": 345600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0373, + "step": 345700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.041, + "step": 345800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0423, + "step": 345900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0399, + "step": 346000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0399, + "step": 346100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0418, + "step": 346200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0407, + "step": 346300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0408, + "step": 346400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0405, + "step": 346500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0379, + "step": 346600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0405, + "step": 346700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0422, + "step": 346800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0416, + "step": 346900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0417, + "step": 347000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0402, + "step": 347100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0394, + "step": 347200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0399, + "step": 347300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0403, + "step": 347400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0397, + "step": 347500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0385, + "step": 347600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0409, + "step": 347700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.04, + "step": 347800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0411, + "step": 347900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0419, + "step": 348000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0387, + "step": 348100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0399, + "step": 348200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0428, + "step": 348300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0417, + "step": 348400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0402, + "step": 348500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0401, + "step": 348600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0399, + "step": 348700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0381, + "step": 348800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0408, + "step": 348900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0393, + "step": 349000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0378, + "step": 349100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0438, + "step": 349200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0404, + "step": 349300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0416, + "step": 349400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0405, + "step": 349500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0405, + "step": 349600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.041, + "step": 349700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.04, + "step": 349800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0385, + "step": 349900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0394, + "step": 350000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0403, + "step": 350100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0389, + "step": 350200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0418, + "step": 350300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0388, + "step": 350400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0404, + "step": 350500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0422, + "step": 350600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0391, + "step": 350700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0403, + "step": 350800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0397, + "step": 350900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0389, + "step": 351000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0412, + "step": 351100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0408, + "step": 351200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0392, + "step": 351300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0395, + "step": 351400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0384, + "step": 351500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0407, + "step": 351600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.04, + "step": 351700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.04, + "step": 351800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0419, + "step": 351900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0396, + "step": 352000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0411, + "step": 352100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0412, + "step": 352200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.041, + "step": 352300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0421, + "step": 352400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0404, + "step": 352500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0407, + "step": 352600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0396, + "step": 352700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0399, + "step": 352800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0396, + "step": 352900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0389, + "step": 353000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0401, + "step": 353100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0396, + "step": 353200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0378, + "step": 353300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0408, + "step": 353400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0422, + "step": 353500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0401, + "step": 353600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0383, + "step": 353700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0403, + "step": 353800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0404, + "step": 353900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.04, + "step": 354000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0386, + "step": 354100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.04, + "step": 354200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0409, + "step": 354300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0403, + "step": 354400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0412, + "step": 354500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.04, + "step": 354600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0415, + "step": 354700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0385, + "step": 354800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0386, + "step": 354900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0422, + "step": 355000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0407, + "step": 355100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0424, + "step": 355200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0401, + "step": 355300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0415, + "step": 355400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0398, + "step": 355500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0404, + "step": 355600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0417, + "step": 355700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0388, + "step": 355800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0384, + "step": 355900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0385, + "step": 356000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0401, + "step": 356100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0424, + "step": 356200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0392, + "step": 356300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.039, + "step": 356400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0394, + "step": 356500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0407, + "step": 356600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.039, + "step": 356700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0396, + "step": 356800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0406, + "step": 356900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0382, + "step": 357000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.041, + "step": 357100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0403, + "step": 357200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0418, + "step": 357300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0392, + "step": 357400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0408, + "step": 357500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0411, + "step": 357600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.039, + "step": 357700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0385, + "step": 357800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0396, + "step": 357900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0405, + "step": 358000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0397, + "step": 358100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0416, + "step": 358200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0409, + "step": 358300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0403, + "step": 358400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0413, + "step": 358500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0397, + "step": 358600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0423, + "step": 358700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0392, + "step": 358800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0401, + "step": 358900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0397, + "step": 359000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0402, + "step": 359100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0389, + "step": 359200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0386, + "step": 359300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0423, + "step": 359400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0381, + "step": 359500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0394, + "step": 359600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0395, + "step": 359700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0402, + "step": 359800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.041, + "step": 359900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0389, + "step": 360000 + }, + { + "epoch": 0.0, + "eval_loss": 0.035430908203125, + "eval_runtime": 3371.4761, + "eval_samples_per_second": 333.6, + "eval_steps_per_second": 20.85, + "step": 360000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0381, + "step": 360100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0409, + "step": 360200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0396, + "step": 360300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0385, + "step": 360400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0401, + "step": 360500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0401, + "step": 360600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0407, + "step": 360700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0417, + "step": 360800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.04, + "step": 360900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.04, + "step": 361000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0402, + "step": 361100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0408, + "step": 361200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0402, + "step": 361300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0396, + "step": 361400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0393, + "step": 361500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0414, + "step": 361600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0405, + "step": 361700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.041, + "step": 361800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.042, + "step": 361900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0398, + "step": 362000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0375, + "step": 362100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0407, + "step": 362200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0421, + "step": 362300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0409, + "step": 362400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0402, + "step": 362500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0397, + "step": 362600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0415, + "step": 362700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0402, + "step": 362800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0427, + "step": 362900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0404, + "step": 363000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0395, + "step": 363100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0401, + "step": 363200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.042, + "step": 363300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0402, + "step": 363400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0394, + "step": 363500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0377, + "step": 363600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0406, + "step": 363700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0402, + "step": 363800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0405, + "step": 363900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.041, + "step": 364000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 364100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.041, + "step": 364200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0395, + "step": 364300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.043, + "step": 364400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0399, + "step": 364500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0377, + "step": 364600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.039, + "step": 364700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0406, + "step": 364800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.04, + "step": 364900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0389, + "step": 365000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0388, + "step": 365100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 365200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0421, + "step": 365300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0382, + "step": 365400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0404, + "step": 365500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0405, + "step": 365600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.041, + "step": 365700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0381, + "step": 365800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0397, + "step": 365900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.039, + "step": 366000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0395, + "step": 366100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0388, + "step": 366200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0396, + "step": 366300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0394, + "step": 366400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0416, + "step": 366500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0415, + "step": 366600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0406, + "step": 366700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0414, + "step": 366800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0422, + "step": 366900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0428, + "step": 367000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0413, + "step": 367100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.04, + "step": 367200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0397, + "step": 367300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0382, + "step": 367400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0373, + "step": 367500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0422, + "step": 367600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0386, + "step": 367700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.041, + "step": 367800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0382, + "step": 367900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0393, + "step": 368000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0389, + "step": 368100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0388, + "step": 368200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0393, + "step": 368300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0407, + "step": 368400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.04, + "step": 368500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0398, + "step": 368600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0391, + "step": 368700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0419, + "step": 368800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0405, + "step": 368900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0389, + "step": 369000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0414, + "step": 369100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0394, + "step": 369200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0395, + "step": 369300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0393, + "step": 369400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0401, + "step": 369500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0414, + "step": 369600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0401, + "step": 369700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0388, + "step": 369800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.041, + "step": 369900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0416, + "step": 370000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0386, + "step": 370100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0382, + "step": 370200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0395, + "step": 370300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0385, + "step": 370400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0408, + "step": 370500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0397, + "step": 370600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0395, + "step": 370700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0391, + "step": 370800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0403, + "step": 370900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0404, + "step": 371000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0406, + "step": 371100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0397, + "step": 371200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0401, + "step": 371300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.041, + "step": 371400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0374, + "step": 371500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0411, + "step": 371600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0388, + "step": 371700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0382, + "step": 371800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0376, + "step": 371900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0399, + "step": 372000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0394, + "step": 372100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0403, + "step": 372200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0399, + "step": 372300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0424, + "step": 372400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0405, + "step": 372500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0394, + "step": 372600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0398, + "step": 372700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.04, + "step": 372800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0396, + "step": 372900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0396, + "step": 373000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0392, + "step": 373100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0409, + "step": 373200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0404, + "step": 373300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0414, + "step": 373400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0388, + "step": 373500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0399, + "step": 373600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.04, + "step": 373700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0399, + "step": 373800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0414, + "step": 373900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0392, + "step": 374000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0398, + "step": 374100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0378, + "step": 374200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0435, + "step": 374300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0386, + "step": 374400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0395, + "step": 374500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0398, + "step": 374600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0411, + "step": 374700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0382, + "step": 374800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0389, + "step": 374900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0401, + "step": 375000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0408, + "step": 375100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.04, + "step": 375200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0417, + "step": 375300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0393, + "step": 375400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0403, + "step": 375500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0406, + "step": 375600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0383, + "step": 375700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0425, + "step": 375800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0406, + "step": 375900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 376000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0388, + "step": 376100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0413, + "step": 376200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0421, + "step": 376300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.039, + "step": 376400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.039, + "step": 376500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.041, + "step": 376600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0395, + "step": 376700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0408, + "step": 376800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0441, + "step": 376900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0375, + "step": 377000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0413, + "step": 377100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0404, + "step": 377200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0382, + "step": 377300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0406, + "step": 377400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0388, + "step": 377500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0389, + "step": 377600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0406, + "step": 377700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0405, + "step": 377800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0378, + "step": 377900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0389, + "step": 378000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0403, + "step": 378100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0397, + "step": 378200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0405, + "step": 378300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0403, + "step": 378400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0402, + "step": 378500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.04, + "step": 378600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0404, + "step": 378700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0403, + "step": 378800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0391, + "step": 378900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0382, + "step": 379000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0395, + "step": 379100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0376, + "step": 379200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0398, + "step": 379300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0402, + "step": 379400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0384, + "step": 379500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0379, + "step": 379600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0408, + "step": 379700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.039, + "step": 379800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0413, + "step": 379900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.039, + "step": 380000 + }, + { + "epoch": 0.0, + "eval_loss": 0.035675048828125, + "eval_runtime": 3845.7026, + "eval_samples_per_second": 292.462, + "eval_steps_per_second": 18.279, + "step": 380000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0409, + "step": 380100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.038, + "step": 380200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0388, + "step": 380300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0391, + "step": 380400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0408, + "step": 380500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0402, + "step": 380600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.039, + "step": 380700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.039, + "step": 380800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0415, + "step": 380900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.039, + "step": 381000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0421, + "step": 381100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0406, + "step": 381200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0398, + "step": 381300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0383, + "step": 381400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0397, + "step": 381500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0379, + "step": 381600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0396, + "step": 381700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0391, + "step": 381800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0377, + "step": 381900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0415, + "step": 382000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0382, + "step": 382100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0427, + "step": 382200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0395, + "step": 382300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.04, + "step": 382400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0382, + "step": 382500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0409, + "step": 382600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0402, + "step": 382700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0404, + "step": 382800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0386, + "step": 382900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0388, + "step": 383000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0376, + "step": 383100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0431, + "step": 383200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0406, + "step": 383300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0403, + "step": 383400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0399, + "step": 383500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.038, + "step": 383600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0393, + "step": 383700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0409, + "step": 383800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0388, + "step": 383900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0383, + "step": 384000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0388, + "step": 384100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0393, + "step": 384200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0395, + "step": 384300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0398, + "step": 384400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0386, + "step": 384500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0389, + "step": 384600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0396, + "step": 384700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0393, + "step": 384800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0383, + "step": 384900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0387, + "step": 385000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0398, + "step": 385100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.038, + "step": 385200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0404, + "step": 385300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0399, + "step": 385400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0387, + "step": 385500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0386, + "step": 385600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0397, + "step": 385700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0399, + "step": 385800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0407, + "step": 385900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0377, + "step": 386000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0401, + "step": 386100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0419, + "step": 386200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0379, + "step": 386300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0402, + "step": 386400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0406, + "step": 386500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0391, + "step": 386600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.039, + "step": 386700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0387, + "step": 386800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0391, + "step": 386900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0406, + "step": 387000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.039, + "step": 387100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0387, + "step": 387200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0374, + "step": 387300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0384, + "step": 387400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0396, + "step": 387500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0401, + "step": 387600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0398, + "step": 387700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0416, + "step": 387800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0385, + "step": 387900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0373, + "step": 388000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0375, + "step": 388100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.04, + "step": 388200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0383, + "step": 388300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0393, + "step": 388400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0398, + "step": 388500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0394, + "step": 388600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0404, + "step": 388700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0372, + "step": 388800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0398, + "step": 388900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0404, + "step": 389000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0381, + "step": 389100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.039, + "step": 389200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0378, + "step": 389300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0375, + "step": 389400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0414, + "step": 389500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0411, + "step": 389600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0399, + "step": 389700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0395, + "step": 389800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0407, + "step": 389900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0416, + "step": 390000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0399, + "step": 390100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0394, + "step": 390200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0388, + "step": 390300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0404, + "step": 390400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0395, + "step": 390500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0383, + "step": 390600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0414, + "step": 390700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0371, + "step": 390800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.041, + "step": 390900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0398, + "step": 391000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0384, + "step": 391100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.039, + "step": 391200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0375, + "step": 391300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0403, + "step": 391400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0402, + "step": 391500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0387, + "step": 391600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0396, + "step": 391700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0403, + "step": 391800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0387, + "step": 391900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0399, + "step": 392000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0391, + "step": 392100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0377, + "step": 392200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0379, + "step": 392300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0403, + "step": 392400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0378, + "step": 392500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0392, + "step": 392600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0381, + "step": 392700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0383, + "step": 392800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0382, + "step": 392900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0388, + "step": 393000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0409, + "step": 393100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.04, + "step": 393200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0386, + "step": 393300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0398, + "step": 393400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0374, + "step": 393500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 393600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0392, + "step": 393700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0386, + "step": 393800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.038, + "step": 393900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0388, + "step": 394000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0397, + "step": 394100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0406, + "step": 394200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0393, + "step": 394300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.039, + "step": 394400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0386, + "step": 394500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0397, + "step": 394600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0426, + "step": 394700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0385, + "step": 394800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0389, + "step": 394900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0402, + "step": 395000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0394, + "step": 395100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 395200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0405, + "step": 395300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 395400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0387, + "step": 395500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0402, + "step": 395600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0377, + "step": 395700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0388, + "step": 395800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0388, + "step": 395900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0399, + "step": 396000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0398, + "step": 396100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0386, + "step": 396200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0402, + "step": 396300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0414, + "step": 396400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0395, + "step": 396500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0391, + "step": 396600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0405, + "step": 396700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0406, + "step": 396800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0395, + "step": 396900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0403, + "step": 397000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0384, + "step": 397100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0379, + "step": 397200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0381, + "step": 397300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0392, + "step": 397400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0382, + "step": 397500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0374, + "step": 397600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.041, + "step": 397700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0378, + "step": 397800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0388, + "step": 397900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0375, + "step": 398000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0391, + "step": 398100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0414, + "step": 398200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0399, + "step": 398300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0403, + "step": 398400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0388, + "step": 398500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0385, + "step": 398600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0409, + "step": 398700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0399, + "step": 398800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0377, + "step": 398900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0386, + "step": 399000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0423, + "step": 399100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0396, + "step": 399200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0391, + "step": 399300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0391, + "step": 399400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0403, + "step": 399500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0386, + "step": 399600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0369, + "step": 399700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0398, + "step": 399800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.04, + "step": 399900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0405, + "step": 400000 + }, + { + "epoch": 0.0, + "eval_loss": 0.034576416015625, + "eval_runtime": 3914.1663, + "eval_samples_per_second": 287.347, + "eval_steps_per_second": 17.959, + "step": 400000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0382, + "step": 400100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0388, + "step": 400200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0387, + "step": 400300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0404, + "step": 400400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0404, + "step": 400500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0388, + "step": 400600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0382, + "step": 400700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.039, + "step": 400800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.038, + "step": 400900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0378, + "step": 401000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 401100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0405, + "step": 401200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0391, + "step": 401300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0389, + "step": 401400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.04, + "step": 401500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0388, + "step": 401600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.038, + "step": 401700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0407, + "step": 401800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.039, + "step": 401900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0399, + "step": 402000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0381, + "step": 402100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0399, + "step": 402200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0371, + "step": 402300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.039, + "step": 402400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0391, + "step": 402500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0405, + "step": 402600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0386, + "step": 402700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0392, + "step": 402800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0407, + "step": 402900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.039, + "step": 403000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0404, + "step": 403100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0382, + "step": 403200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 403300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0391, + "step": 403400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0414, + "step": 403500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 403600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0405, + "step": 403700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0375, + "step": 403800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 403900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0385, + "step": 404000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0407, + "step": 404100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0378, + "step": 404200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0384, + "step": 404300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0396, + "step": 404400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0406, + "step": 404500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0394, + "step": 404600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0383, + "step": 404700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0394, + "step": 404800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0405, + "step": 404900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0408, + "step": 405000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0392, + "step": 405100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0382, + "step": 405200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0382, + "step": 405300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0392, + "step": 405400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0392, + "step": 405500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0381, + "step": 405600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0391, + "step": 405700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0388, + "step": 405800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0395, + "step": 405900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0389, + "step": 406000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0412, + "step": 406100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0385, + "step": 406200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0396, + "step": 406300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 406400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0384, + "step": 406500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0379, + "step": 406600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0394, + "step": 406700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0393, + "step": 406800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0395, + "step": 406900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0414, + "step": 407000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0396, + "step": 407100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0379, + "step": 407200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0396, + "step": 407300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0388, + "step": 407400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0395, + "step": 407500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0396, + "step": 407600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0397, + "step": 407700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0382, + "step": 407800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0383, + "step": 407900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0386, + "step": 408000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0402, + "step": 408100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0385, + "step": 408200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0393, + "step": 408300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0388, + "step": 408400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0405, + "step": 408500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0384, + "step": 408600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0411, + "step": 408700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0381, + "step": 408800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0389, + "step": 408900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 409000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0381, + "step": 409100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.039, + "step": 409200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0384, + "step": 409300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0398, + "step": 409400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0416, + "step": 409500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0391, + "step": 409600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0389, + "step": 409700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.04, + "step": 409800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0413, + "step": 409900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0412, + "step": 410000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0385, + "step": 410100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0372, + "step": 410200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0392, + "step": 410300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0374, + "step": 410400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0389, + "step": 410500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0389, + "step": 410600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.038, + "step": 410700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0378, + "step": 410800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0388, + "step": 410900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0372, + "step": 411000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0384, + "step": 411100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0416, + "step": 411200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0411, + "step": 411300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0401, + "step": 411400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.04, + "step": 411500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0401, + "step": 411600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0405, + "step": 411700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0382, + "step": 411800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0385, + "step": 411900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0377, + "step": 412000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0383, + "step": 412100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0412, + "step": 412200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0393, + "step": 412300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0387, + "step": 412400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0386, + "step": 412500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0418, + "step": 412600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0409, + "step": 412700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0374, + "step": 412800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0391, + "step": 412900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0377, + "step": 413000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0402, + "step": 413100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.04, + "step": 413200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0373, + "step": 413300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0382, + "step": 413400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0387, + "step": 413500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0372, + "step": 413600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0378, + "step": 413700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0381, + "step": 413800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.041, + "step": 413900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 414000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.039, + "step": 414100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0407, + "step": 414200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0381, + "step": 414300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0389, + "step": 414400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0393, + "step": 414500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0388, + "step": 414600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0403, + "step": 414700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0392, + "step": 414800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0395, + "step": 414900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0386, + "step": 415000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0368, + "step": 415100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0388, + "step": 415200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0384, + "step": 415300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.039, + "step": 415400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0387, + "step": 415500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0375, + "step": 415600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0401, + "step": 415700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0376, + "step": 415800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0388, + "step": 415900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0384, + "step": 416000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0403, + "step": 416100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0392, + "step": 416200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.04, + "step": 416300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0402, + "step": 416400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0417, + "step": 416500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0385, + "step": 416600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 416700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0393, + "step": 416800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0399, + "step": 416900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0394, + "step": 417000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0397, + "step": 417100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0402, + "step": 417200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0383, + "step": 417300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0385, + "step": 417400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0411, + "step": 417500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0391, + "step": 417600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0372, + "step": 417700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0375, + "step": 417800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0379, + "step": 417900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0393, + "step": 418000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0388, + "step": 418100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0411, + "step": 418200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0386, + "step": 418300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0397, + "step": 418400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0408, + "step": 418500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0407, + "step": 418600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0394, + "step": 418700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0386, + "step": 418800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0381, + "step": 418900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0373, + "step": 419000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0406, + "step": 419100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0377, + "step": 419200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0401, + "step": 419300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0409, + "step": 419400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0389, + "step": 419500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0399, + "step": 419600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0391, + "step": 419700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0381, + "step": 419800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0372, + "step": 419900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0398, + "step": 420000 + }, + { + "epoch": 0.0, + "eval_loss": 0.033843994140625, + "eval_runtime": 3662.1217, + "eval_samples_per_second": 307.123, + "eval_steps_per_second": 19.195, + "step": 420000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0369, + "step": 420100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0398, + "step": 420200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0403, + "step": 420300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0369, + "step": 420400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0403, + "step": 420500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0391, + "step": 420600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.038, + "step": 420700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0394, + "step": 420800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0386, + "step": 420900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0385, + "step": 421000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0376, + "step": 421100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0385, + "step": 421200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0388, + "step": 421300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0374, + "step": 421400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0405, + "step": 421500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0402, + "step": 421600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0389, + "step": 421700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.038, + "step": 421800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0377, + "step": 421900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0396, + "step": 422000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0389, + "step": 422100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.039, + "step": 422200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0368, + "step": 422300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0419, + "step": 422400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0397, + "step": 422500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0392, + "step": 422600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0388, + "step": 422700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0383, + "step": 422800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0378, + "step": 422900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 423000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 423100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0389, + "step": 423200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0388, + "step": 423300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0377, + "step": 423400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 423500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 423600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0385, + "step": 423700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0405, + "step": 423800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0383, + "step": 423900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0392, + "step": 424000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 424100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0408, + "step": 424200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0381, + "step": 424300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 424400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0387, + "step": 424500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0385, + "step": 424600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0371, + "step": 424700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0398, + "step": 424800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0391, + "step": 424900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0398, + "step": 425000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0374, + "step": 425100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0371, + "step": 425200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0388, + "step": 425300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 425400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0386, + "step": 425500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0375, + "step": 425600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0394, + "step": 425700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 425800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0375, + "step": 425900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0375, + "step": 426000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0379, + "step": 426100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0384, + "step": 426200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.039, + "step": 426300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0391, + "step": 426400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0374, + "step": 426500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.041, + "step": 426600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0373, + "step": 426700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0368, + "step": 426800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0388, + "step": 426900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0408, + "step": 427000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.039, + "step": 427100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 427200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0393, + "step": 427300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0374, + "step": 427400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.038, + "step": 427500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0379, + "step": 427600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0392, + "step": 427700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 427800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0388, + "step": 427900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0369, + "step": 428000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0393, + "step": 428100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0373, + "step": 428200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0388, + "step": 428300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0398, + "step": 428400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0389, + "step": 428500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0415, + "step": 428600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0379, + "step": 428700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0378, + "step": 428800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0372, + "step": 428900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0373, + "step": 429000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0374, + "step": 429100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0385, + "step": 429200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0374, + "step": 429300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0404, + "step": 429400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 429500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0375, + "step": 429600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0393, + "step": 429700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 429800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0388, + "step": 429900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0384, + "step": 430000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0391, + "step": 430100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0388, + "step": 430200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0395, + "step": 430300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0383, + "step": 430400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0408, + "step": 430500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0376, + "step": 430600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0394, + "step": 430700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0395, + "step": 430800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0398, + "step": 430900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.038, + "step": 431000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0406, + "step": 431100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0383, + "step": 431200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0388, + "step": 431300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 431400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0378, + "step": 431500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0379, + "step": 431600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.039, + "step": 431700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 431800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0384, + "step": 431900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 432000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0399, + "step": 432100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0393, + "step": 432200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0377, + "step": 432300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0395, + "step": 432400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0384, + "step": 432500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0389, + "step": 432600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0377, + "step": 432700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0379, + "step": 432800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0388, + "step": 432900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0391, + "step": 433000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0392, + "step": 433100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0383, + "step": 433200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0388, + "step": 433300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0387, + "step": 433400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.038, + "step": 433500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0397, + "step": 433600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0372, + "step": 433700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0376, + "step": 433800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 433900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0416, + "step": 434000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0396, + "step": 434100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0387, + "step": 434200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 434300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 434400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 434500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0383, + "step": 434600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0378, + "step": 434700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0386, + "step": 434800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0375, + "step": 434900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0383, + "step": 435000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0403, + "step": 435100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0386, + "step": 435200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0379, + "step": 435300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0386, + "step": 435400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0384, + "step": 435500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0407, + "step": 435600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 435700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0386, + "step": 435800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.038, + "step": 435900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0372, + "step": 436000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0385, + "step": 436100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0394, + "step": 436200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0382, + "step": 436300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0393, + "step": 436400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0402, + "step": 436500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 436600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0396, + "step": 436700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0379, + "step": 436800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 436900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0375, + "step": 437000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 437100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0386, + "step": 437200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0383, + "step": 437300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 437400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0383, + "step": 437500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 437600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0385, + "step": 437700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 437800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0384, + "step": 437900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0395, + "step": 438000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0401, + "step": 438100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0401, + "step": 438200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.038, + "step": 438300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0378, + "step": 438400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 438500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0405, + "step": 438600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.039, + "step": 438700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0383, + "step": 438800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0379, + "step": 438900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.039, + "step": 439000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0403, + "step": 439100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0385, + "step": 439200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 439300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0398, + "step": 439400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0386, + "step": 439500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0386, + "step": 439600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0401, + "step": 439700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 439800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0378, + "step": 439900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0377, + "step": 440000 + }, + { + "epoch": 0.0, + "eval_loss": 0.03411865234375, + "eval_runtime": 4314.8615, + "eval_samples_per_second": 260.663, + "eval_steps_per_second": 16.292, + "step": 440000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0396, + "step": 440100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0401, + "step": 440200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0379, + "step": 440300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0397, + "step": 440400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.04, + "step": 440500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0385, + "step": 440600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0388, + "step": 440700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 440800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0369, + "step": 440900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0374, + "step": 441000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0391, + "step": 441100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0376, + "step": 441200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 441300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0401, + "step": 441400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0377, + "step": 441500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0396, + "step": 441600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0396, + "step": 441700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.038, + "step": 441800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 441900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0396, + "step": 442000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0375, + "step": 442100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0384, + "step": 442200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0389, + "step": 442300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0369, + "step": 442400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0405, + "step": 442500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0391, + "step": 442600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.038, + "step": 442700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0371, + "step": 442800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 442900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0399, + "step": 443000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0385, + "step": 443100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0371, + "step": 443200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0385, + "step": 443300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0398, + "step": 443400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0373, + "step": 443500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0386, + "step": 443600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 443700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0382, + "step": 443800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0384, + "step": 443900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0402, + "step": 444000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0392, + "step": 444100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0386, + "step": 444200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0376, + "step": 444300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0381, + "step": 444400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0373, + "step": 444500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0377, + "step": 444600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0388, + "step": 444700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0377, + "step": 444800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.04, + "step": 444900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.039, + "step": 445000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0384, + "step": 445100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0397, + "step": 445200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0373, + "step": 445300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 445400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0387, + "step": 445500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0382, + "step": 445600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0386, + "step": 445700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0375, + "step": 445800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0383, + "step": 445900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.038, + "step": 446000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0393, + "step": 446100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0369, + "step": 446200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 446300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0393, + "step": 446400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.039, + "step": 446500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0375, + "step": 446600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0387, + "step": 446700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0384, + "step": 446800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 446900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0391, + "step": 447000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0378, + "step": 447100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0394, + "step": 447200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0389, + "step": 447300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0374, + "step": 447400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0379, + "step": 447500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0375, + "step": 447600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0372, + "step": 447700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0375, + "step": 447800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0373, + "step": 447900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 448000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0369, + "step": 448100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0385, + "step": 448200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0371, + "step": 448300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0378, + "step": 448400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 448500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0378, + "step": 448600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.04, + "step": 448700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 448800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0376, + "step": 448900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0382, + "step": 449000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0424, + "step": 449100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0373, + "step": 449200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 449300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.039, + "step": 449400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0396, + "step": 449500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0388, + "step": 449600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0373, + "step": 449700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0384, + "step": 449800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0379, + "step": 449900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0372, + "step": 450000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0381, + "step": 450100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0399, + "step": 450200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0406, + "step": 450300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0385, + "step": 450400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.038, + "step": 450500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0384, + "step": 450600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0394, + "step": 450700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0406, + "step": 450800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0381, + "step": 450900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0376, + "step": 451000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0374, + "step": 451100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 451200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0392, + "step": 451300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0402, + "step": 451400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0372, + "step": 451500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0391, + "step": 451600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0386, + "step": 451700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0383, + "step": 451800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 451900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0374, + "step": 452000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0419, + "step": 452100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0385, + "step": 452200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0375, + "step": 452300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0371, + "step": 452400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0387, + "step": 452500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0399, + "step": 452600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0386, + "step": 452700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0382, + "step": 452800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0409, + "step": 452900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 453000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0371, + "step": 453100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0376, + "step": 453200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 453300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.039, + "step": 453400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0383, + "step": 453500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0385, + "step": 453600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0389, + "step": 453700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0387, + "step": 453800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0374, + "step": 453900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0376, + "step": 454000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 454100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0392, + "step": 454200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0404, + "step": 454300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0373, + "step": 454400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 454500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 454600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0413, + "step": 454700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0369, + "step": 454800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0375, + "step": 454900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0377, + "step": 455000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0383, + "step": 455100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0389, + "step": 455200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0383, + "step": 455300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.039, + "step": 455400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 455500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0373, + "step": 455600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0391, + "step": 455700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0391, + "step": 455800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 455900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 456000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0384, + "step": 456100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0385, + "step": 456200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0409, + "step": 456300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 456400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.038, + "step": 456500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0401, + "step": 456600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0385, + "step": 456700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0379, + "step": 456800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0383, + "step": 456900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0372, + "step": 457000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0381, + "step": 457100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0389, + "step": 457200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 457300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0382, + "step": 457400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.038, + "step": 457500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 457600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0392, + "step": 457700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0383, + "step": 457800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 457900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0371, + "step": 458000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0372, + "step": 458100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0382, + "step": 458200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0383, + "step": 458300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0376, + "step": 458400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0387, + "step": 458500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0376, + "step": 458600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 458700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 458800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0384, + "step": 458900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0374, + "step": 459000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0395, + "step": 459100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0384, + "step": 459200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0391, + "step": 459300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0374, + "step": 459400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0388, + "step": 459500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0379, + "step": 459600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0379, + "step": 459700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0382, + "step": 459800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0377, + "step": 459900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0383, + "step": 460000 + }, + { + "epoch": 0.0, + "eval_loss": 0.033843994140625, + "eval_runtime": 4130.1582, + "eval_samples_per_second": 272.32, + "eval_steps_per_second": 17.02, + "step": 460000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.039, + "step": 460100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0396, + "step": 460200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0396, + "step": 460300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 460400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0378, + "step": 460500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0382, + "step": 460600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 460700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0382, + "step": 460800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0408, + "step": 460900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0376, + "step": 461000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 461100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0372, + "step": 461200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0377, + "step": 461300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0378, + "step": 461400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 461500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0399, + "step": 461600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0413, + "step": 461700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 461800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0389, + "step": 461900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 462000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0386, + "step": 462100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0416, + "step": 462200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0373, + "step": 462300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0383, + "step": 462400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0393, + "step": 462500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0383, + "step": 462600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0368, + "step": 462700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0371, + "step": 462800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0372, + "step": 462900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0384, + "step": 463000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.038, + "step": 463100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0377, + "step": 463200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0374, + "step": 463300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0375, + "step": 463400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0375, + "step": 463500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0369, + "step": 463600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 463700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0384, + "step": 463800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 463900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 464000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0374, + "step": 464100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0402, + "step": 464200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0382, + "step": 464300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0396, + "step": 464400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 464500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0385, + "step": 464600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0405, + "step": 464700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0368, + "step": 464800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0378, + "step": 464900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0384, + "step": 465000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0377, + "step": 465100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 465200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 465300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0379, + "step": 465400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0398, + "step": 465500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0368, + "step": 465600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 465700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0382, + "step": 465800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0394, + "step": 465900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0395, + "step": 466000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0375, + "step": 466100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0373, + "step": 466200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0376, + "step": 466300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0396, + "step": 466400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 466500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0376, + "step": 466600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0385, + "step": 466700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0397, + "step": 466800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0391, + "step": 466900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0373, + "step": 467000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0378, + "step": 467100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0379, + "step": 467200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0375, + "step": 467300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0373, + "step": 467400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0398, + "step": 467500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0376, + "step": 467600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0386, + "step": 467700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0382, + "step": 467800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0405, + "step": 467900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.038, + "step": 468000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0398, + "step": 468100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 468200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0378, + "step": 468300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 468400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 468500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 468600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0378, + "step": 468700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.038, + "step": 468800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0374, + "step": 468900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0395, + "step": 469000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0378, + "step": 469100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 469200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0388, + "step": 469300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0384, + "step": 469400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 469500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 469600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0374, + "step": 469700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0389, + "step": 469800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0381, + "step": 469900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0378, + "step": 470000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0373, + "step": 470100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.038, + "step": 470200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.039, + "step": 470300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0392, + "step": 470400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0385, + "step": 470500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0391, + "step": 470600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 470700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0379, + "step": 470800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0419, + "step": 470900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0381, + "step": 471000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0378, + "step": 471100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 471200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0385, + "step": 471300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 471400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.04, + "step": 471500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0403, + "step": 471600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 471700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 471800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0375, + "step": 471900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.038, + "step": 472000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0384, + "step": 472100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0383, + "step": 472200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0383, + "step": 472300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0391, + "step": 472400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 472500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0368, + "step": 472600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0408, + "step": 472700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0374, + "step": 472800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 472900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.038, + "step": 473000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.039, + "step": 473100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0369, + "step": 473200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0402, + "step": 473300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.038, + "step": 473400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0374, + "step": 473500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0385, + "step": 473600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0368, + "step": 473700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0385, + "step": 473800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0389, + "step": 473900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0386, + "step": 474000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 474100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0373, + "step": 474200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0369, + "step": 474300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 474400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0383, + "step": 474500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 474600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 474700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0387, + "step": 474800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0384, + "step": 474900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0389, + "step": 475000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0368, + "step": 475100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0377, + "step": 475200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 475300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0381, + "step": 475400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0391, + "step": 475500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0389, + "step": 475600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0384, + "step": 475700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0375, + "step": 475800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0386, + "step": 475900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 476000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0383, + "step": 476100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0381, + "step": 476200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0391, + "step": 476300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 476400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0397, + "step": 476500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0375, + "step": 476600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0373, + "step": 476700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0385, + "step": 476800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0379, + "step": 476900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0397, + "step": 477000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0381, + "step": 477100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0384, + "step": 477200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0378, + "step": 477300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0378, + "step": 477400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0376, + "step": 477500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0378, + "step": 477600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 477700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0393, + "step": 477800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0415, + "step": 477900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0383, + "step": 478000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0375, + "step": 478100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0389, + "step": 478200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0385, + "step": 478300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 478400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 478500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0383, + "step": 478600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0382, + "step": 478700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 478800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0378, + "step": 478900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0382, + "step": 479000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0374, + "step": 479100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 479200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0381, + "step": 479300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0397, + "step": 479400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0383, + "step": 479500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 479600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 479700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 479800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0387, + "step": 479900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0375, + "step": 480000 + }, + { + "epoch": 0.0, + "eval_loss": 0.033477783203125, + "eval_runtime": 2992.0731, + "eval_samples_per_second": 375.901, + "eval_steps_per_second": 23.494, + "step": 480000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.039, + "step": 480100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0402, + "step": 480200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 480300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0389, + "step": 480400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 480500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 480600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0377, + "step": 480700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0383, + "step": 480800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0379, + "step": 480900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0391, + "step": 481000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.038, + "step": 481100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0371, + "step": 481200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 481300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.038, + "step": 481400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 481500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0371, + "step": 481600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0392, + "step": 481700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 481800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0379, + "step": 481900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0375, + "step": 482000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0381, + "step": 482100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0378, + "step": 482200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 482300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0378, + "step": 482400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 482500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 482600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0381, + "step": 482700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0369, + "step": 482800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 482900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.038, + "step": 483000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 483100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 483200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0397, + "step": 483300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.039, + "step": 483400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0395, + "step": 483500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.039, + "step": 483600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0379, + "step": 483700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0378, + "step": 483800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0385, + "step": 483900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0369, + "step": 484000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 484100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0369, + "step": 484200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0369, + "step": 484300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 484400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0376, + "step": 484500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 484600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0374, + "step": 484700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0381, + "step": 484800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 484900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0373, + "step": 485000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0377, + "step": 485100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0387, + "step": 485200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0407, + "step": 485300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 485400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0379, + "step": 485500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.039, + "step": 485600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 485700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 485800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 485900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0374, + "step": 486000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 486100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0375, + "step": 486200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 486300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.041, + "step": 486400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 486500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 486600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 486700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 486800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0382, + "step": 486900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0388, + "step": 487000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0393, + "step": 487100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0394, + "step": 487200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0372, + "step": 487300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0379, + "step": 487400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0386, + "step": 487500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0375, + "step": 487600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0373, + "step": 487700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 487800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0391, + "step": 487900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 488000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0392, + "step": 488100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0377, + "step": 488200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0371, + "step": 488300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 488400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0385, + "step": 488500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0407, + "step": 488600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 488700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 488800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0369, + "step": 488900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 489000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 489100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 489200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0378, + "step": 489300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0374, + "step": 489400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0402, + "step": 489500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 489600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 489700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 489800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 489900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0393, + "step": 490000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 490100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 490200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0388, + "step": 490300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0369, + "step": 490400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.039, + "step": 490500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0379, + "step": 490600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0368, + "step": 490700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 490800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 490900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0373, + "step": 491000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 491100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0388, + "step": 491200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0379, + "step": 491300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0382, + "step": 491400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0368, + "step": 491500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.039, + "step": 491600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0389, + "step": 491700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0381, + "step": 491800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0403, + "step": 491900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 492000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0383, + "step": 492100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 492200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0383, + "step": 492300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 492400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 492500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 492600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.039, + "step": 492700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0376, + "step": 492800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0382, + "step": 492900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0373, + "step": 493000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0369, + "step": 493100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0381, + "step": 493200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0379, + "step": 493300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 493400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 493500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 493600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0378, + "step": 493700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.038, + "step": 493800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 493900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0371, + "step": 494000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0379, + "step": 494100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0379, + "step": 494200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0375, + "step": 494300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0375, + "step": 494400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 494500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0381, + "step": 494600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 494700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 494800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0375, + "step": 494900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0383, + "step": 495000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0375, + "step": 495100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0379, + "step": 495200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0385, + "step": 495300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0368, + "step": 495400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0379, + "step": 495500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0373, + "step": 495600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 495700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 495800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0377, + "step": 495900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 496000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 496100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0373, + "step": 496200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0393, + "step": 496300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 496400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0373, + "step": 496500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 496600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0405, + "step": 496700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0377, + "step": 496800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 496900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0383, + "step": 497000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0376, + "step": 497100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 497200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0379, + "step": 497300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0375, + "step": 497400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0382, + "step": 497500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 497600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0381, + "step": 497700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0376, + "step": 497800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 497900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 498000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 498100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0372, + "step": 498200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0394, + "step": 498300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0386, + "step": 498400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 498500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 498600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0375, + "step": 498700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 498800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 498900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 499000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 499100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 499200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 499300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0368, + "step": 499400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 499500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0385, + "step": 499600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0369, + "step": 499700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 499800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0375, + "step": 499900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0382, + "step": 500000 + }, + { + "epoch": 0.0, + "eval_loss": 0.033447265625, + "eval_runtime": 3057.7078, + "eval_samples_per_second": 367.832, + "eval_steps_per_second": 22.99, + "step": 500000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0385, + "step": 500100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0395, + "step": 500200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 500300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0378, + "step": 500400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0377, + "step": 500500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 500600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0387, + "step": 500700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 500800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0388, + "step": 500900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 501000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 501100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0372, + "step": 501200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0382, + "step": 501300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0393, + "step": 501400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 501500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.038, + "step": 501600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0379, + "step": 501700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 501800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0377, + "step": 501900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0377, + "step": 502000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 502100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 502200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0386, + "step": 502300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 502400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0376, + "step": 502500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0372, + "step": 502600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 502700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 502800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0382, + "step": 502900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.038, + "step": 503000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0374, + "step": 503100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0368, + "step": 503200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0398, + "step": 503300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 503400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0379, + "step": 503500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 503600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0378, + "step": 503700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 503800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 503900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0391, + "step": 504000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 504100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 504200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0379, + "step": 504300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0375, + "step": 504400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0388, + "step": 504500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 504600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 504700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.039, + "step": 504800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0379, + "step": 504900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 505000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 505100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0389, + "step": 505200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 505300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.038, + "step": 505400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 505500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0372, + "step": 505600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 505700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0369, + "step": 505800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0381, + "step": 505900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 506000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0374, + "step": 506100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 506200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0369, + "step": 506300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 506400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0377, + "step": 506500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0377, + "step": 506600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0368, + "step": 506700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0369, + "step": 506800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 506900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0397, + "step": 507000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 507100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0368, + "step": 507200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 507300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0369, + "step": 507400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0382, + "step": 507500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 507600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0388, + "step": 507700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0369, + "step": 507800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0369, + "step": 507900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0379, + "step": 508000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0371, + "step": 508100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0411, + "step": 508200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 508300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0378, + "step": 508400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0388, + "step": 508500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0369, + "step": 508600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0371, + "step": 508700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0395, + "step": 508800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0386, + "step": 508900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0388, + "step": 509000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0372, + "step": 509100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0378, + "step": 509200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0373, + "step": 509300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0385, + "step": 509400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0375, + "step": 509500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 509600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.04, + "step": 509700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0378, + "step": 509800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 509900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0369, + "step": 510000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0388, + "step": 510100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 510200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 510300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0379, + "step": 510400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0384, + "step": 510500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0368, + "step": 510600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 510700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0375, + "step": 510800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0392, + "step": 510900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0376, + "step": 511000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0391, + "step": 511100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 511200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0383, + "step": 511300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 511400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0372, + "step": 511500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0382, + "step": 511600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 511700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0388, + "step": 511800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0385, + "step": 511900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 512000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0387, + "step": 512100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0376, + "step": 512200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 512300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 512400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0376, + "step": 512500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0382, + "step": 512600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 512700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0374, + "step": 512800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 512900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 513000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0373, + "step": 513100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0371, + "step": 513200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 513300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0369, + "step": 513400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0404, + "step": 513500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 513600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 513700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0377, + "step": 513800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0372, + "step": 513900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0389, + "step": 514000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0378, + "step": 514100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 514200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0385, + "step": 514300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 514400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 514500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 514600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 514700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0399, + "step": 514800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 514900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0383, + "step": 515000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0385, + "step": 515100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 515200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 515300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0393, + "step": 515400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0373, + "step": 515500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 515600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 515700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 515800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 515900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 516000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0368, + "step": 516100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 516200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0373, + "step": 516300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 516400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0383, + "step": 516500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0387, + "step": 516600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 516700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 516800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0401, + "step": 516900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 517000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0378, + "step": 517100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0377, + "step": 517200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0379, + "step": 517300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0368, + "step": 517400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0374, + "step": 517500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0386, + "step": 517600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 517700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0382, + "step": 517800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0394, + "step": 517900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0369, + "step": 518000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0371, + "step": 518100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0391, + "step": 518200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0371, + "step": 518300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0374, + "step": 518400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.039, + "step": 518500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0419, + "step": 518600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 518700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 518800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0369, + "step": 518900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 519000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0376, + "step": 519100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 519200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 519300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0368, + "step": 519400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0372, + "step": 519500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0369, + "step": 519600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0368, + "step": 519700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 519800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0382, + "step": 519900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0373, + "step": 520000 + }, + { + "epoch": 0.0, + "eval_loss": 0.032989501953125, + "eval_runtime": 3122.3006, + "eval_samples_per_second": 360.223, + "eval_steps_per_second": 22.514, + "step": 520000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 520100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0371, + "step": 520200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0383, + "step": 520300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 520400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0385, + "step": 520500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 520600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0368, + "step": 520700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0371, + "step": 520800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 520900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0371, + "step": 521000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 521100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 521200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0375, + "step": 521300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0368, + "step": 521400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 521500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0372, + "step": 521600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 521700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 521800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0382, + "step": 521900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 522000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 522100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 522200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0371, + "step": 522300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 522400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0376, + "step": 522500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0385, + "step": 522600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 522700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0376, + "step": 522800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0384, + "step": 522900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0373, + "step": 523000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 523100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 523200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.038, + "step": 523300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 523400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0395, + "step": 523500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0371, + "step": 523600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0375, + "step": 523700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0371, + "step": 523800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 523900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 524000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.038, + "step": 524100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 524200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 524300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 524400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 524500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0371, + "step": 524600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0374, + "step": 524700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0376, + "step": 524800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0381, + "step": 524900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0369, + "step": 525000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 525100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0391, + "step": 525200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 525300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.038, + "step": 525400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 525500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 525600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 525700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 525800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 525900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0369, + "step": 526000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0369, + "step": 526100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 526200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 526300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0382, + "step": 526400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0381, + "step": 526500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 526600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.038, + "step": 526700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 526800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 526900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0387, + "step": 527000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0373, + "step": 527100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0372, + "step": 527200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0387, + "step": 527300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 527400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 527500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 527600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 527700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0375, + "step": 527800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.038, + "step": 527900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 528000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0372, + "step": 528100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0368, + "step": 528200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0373, + "step": 528300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 528400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0401, + "step": 528500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 528600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0424, + "step": 528700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 528800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0392, + "step": 528900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 529000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0382, + "step": 529100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 529200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0378, + "step": 529300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0373, + "step": 529400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0378, + "step": 529500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0383, + "step": 529600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 529700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 529800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 529900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0385, + "step": 530000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 530100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0378, + "step": 530200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 530300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0368, + "step": 530400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 530500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.038, + "step": 530600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 530700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0378, + "step": 530800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0369, + "step": 530900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 531000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 531100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 531200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 531300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0395, + "step": 531400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0369, + "step": 531500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 531600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 531700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0388, + "step": 531800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 531900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 532000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 532100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 532200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0378, + "step": 532300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 532400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0388, + "step": 532500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0372, + "step": 532600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0377, + "step": 532700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 532800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 532900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 533000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 533100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 533200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0372, + "step": 533300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 533400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0377, + "step": 533500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 533600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 533700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 533800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0372, + "step": 533900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 534000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0383, + "step": 534100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0369, + "step": 534200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0381, + "step": 534300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 534400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0373, + "step": 534500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 534600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0379, + "step": 534700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 534800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 534900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 535000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0386, + "step": 535100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 535200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 535300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 535400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0368, + "step": 535500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 535600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 535700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0383, + "step": 535800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 535900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 536000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0373, + "step": 536100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0371, + "step": 536200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 536300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 536400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 536500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 536600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0375, + "step": 536700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 536800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0372, + "step": 536900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 537000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0369, + "step": 537100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 537200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 537300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 537400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0376, + "step": 537500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 537600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.038, + "step": 537700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0369, + "step": 537800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 537900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0376, + "step": 538000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 538100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0385, + "step": 538200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 538300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 538400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 538500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 538600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0393, + "step": 538700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 538800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0373, + "step": 538900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0374, + "step": 539000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 539100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 539200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 539300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 539400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 539500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 539600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0376, + "step": 539700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0376, + "step": 539800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0375, + "step": 539900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0379, + "step": 540000 + }, + { + "epoch": 0.0, + "eval_loss": 0.032684326171875, + "eval_runtime": 3257.3593, + "eval_samples_per_second": 345.287, + "eval_steps_per_second": 21.581, + "step": 540000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0378, + "step": 540100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0374, + "step": 540200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 540300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0386, + "step": 540400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0377, + "step": 540500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 540600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 540700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0382, + "step": 540800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0369, + "step": 540900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.038, + "step": 541000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.038, + "step": 541100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 541200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0373, + "step": 541300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0388, + "step": 541400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 541500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 541600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0386, + "step": 541700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0373, + "step": 541800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0399, + "step": 541900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 542000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 542100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0378, + "step": 542200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 542300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 542400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 542500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0384, + "step": 542600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 542700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 542800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0368, + "step": 542900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 543000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 543100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 543200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0389, + "step": 543300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 543400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 543500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 543600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 543700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 543800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 543900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 544000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 544100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0372, + "step": 544200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.038, + "step": 544300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 544400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0376, + "step": 544500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 544600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0386, + "step": 544700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0371, + "step": 544800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 544900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 545000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 545100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 545200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0373, + "step": 545300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 545400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0396, + "step": 545500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0374, + "step": 545600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 545700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0368, + "step": 545800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 545900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 546000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0386, + "step": 546100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0383, + "step": 546200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 546300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 546400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 546500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 546600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0396, + "step": 546700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 546800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0372, + "step": 546900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0374, + "step": 547000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 547100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 547200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 547300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 547400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 547500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0379, + "step": 547600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0374, + "step": 547700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0368, + "step": 547800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 547900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 548000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0374, + "step": 548100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 548200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 548300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 548400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0368, + "step": 548500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0376, + "step": 548600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0369, + "step": 548700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0369, + "step": 548800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 548900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0384, + "step": 549000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 549100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 549200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0372, + "step": 549300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0373, + "step": 549400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 549500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.038, + "step": 549600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0379, + "step": 549700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 549800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 549900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0382, + "step": 550000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0386, + "step": 550100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 550200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 550300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 550400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 550500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 550600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 550700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0391, + "step": 550800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 550900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 551000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 551100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 551200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0388, + "step": 551300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 551400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 551500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 551600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0381, + "step": 551700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0381, + "step": 551800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 551900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0369, + "step": 552000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 552100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 552200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 552300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.038, + "step": 552400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0377, + "step": 552500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 552600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0373, + "step": 552700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0384, + "step": 552800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0379, + "step": 552900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0393, + "step": 553000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 553100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 553200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 553300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 553400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 553500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 553600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 553700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0386, + "step": 553800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 553900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 554000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 554100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0388, + "step": 554200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 554300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 554400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 554500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 554600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0382, + "step": 554700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0373, + "step": 554800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0369, + "step": 554900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0374, + "step": 555000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 555100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0375, + "step": 555200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 555300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 555400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 555500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 555600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 555700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0395, + "step": 555800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 555900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0399, + "step": 556000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 556100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0383, + "step": 556200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 556300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.038, + "step": 556400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 556500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 556600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 556700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 556800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 556900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0371, + "step": 557000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 557100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 557200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0372, + "step": 557300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 557400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 557500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 557600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 557700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0378, + "step": 557800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0383, + "step": 557900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0388, + "step": 558000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 558100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0369, + "step": 558200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.038, + "step": 558300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.038, + "step": 558400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 558500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 558600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 558700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 558800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0382, + "step": 558900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 559000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 559100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 559200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 559300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0395, + "step": 559400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0386, + "step": 559500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 559600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0373, + "step": 559700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 559800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 559900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 560000 + }, + { + "epoch": 0.0, + "eval_loss": 0.032989501953125, + "eval_runtime": 3191.314, + "eval_samples_per_second": 352.433, + "eval_steps_per_second": 22.027, + "step": 560000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0381, + "step": 560100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.038, + "step": 560200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 560300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 560400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 560500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0383, + "step": 560600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 560700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 560800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 560900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0369, + "step": 561000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 561100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 561200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0404, + "step": 561300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 561400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 561500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0375, + "step": 561600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 561700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 561800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 561900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0376, + "step": 562000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 562100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 562200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 562300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 562400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 562500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 562600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 562700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0384, + "step": 562800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0387, + "step": 562900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 563000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0374, + "step": 563100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 563200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 563300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 563400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0368, + "step": 563500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 563600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 563700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 563800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 563900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0377, + "step": 564000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 564100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 564200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0374, + "step": 564300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0368, + "step": 564400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0372, + "step": 564500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 564600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0375, + "step": 564700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 564800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 564900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0374, + "step": 565000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0369, + "step": 565100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 565200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0395, + "step": 565300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 565400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 565500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0383, + "step": 565600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 565700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0375, + "step": 565800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.038, + "step": 565900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 566000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 566100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0374, + "step": 566200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 566300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0375, + "step": 566400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.038, + "step": 566500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 566600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0372, + "step": 566700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 566800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 566900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 567000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 567100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0368, + "step": 567200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0371, + "step": 567300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0369, + "step": 567400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0382, + "step": 567500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 567600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 567700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 567800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0372, + "step": 567900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 568000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0369, + "step": 568100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 568200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0378, + "step": 568300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 568400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0381, + "step": 568500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 568600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0377, + "step": 568700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0379, + "step": 568800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0373, + "step": 568900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0377, + "step": 569000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 569100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0373, + "step": 569200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 569300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 569400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 569500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 569600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0378, + "step": 569700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 569800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0368, + "step": 569900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 570000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0371, + "step": 570100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0369, + "step": 570200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0384, + "step": 570300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 570400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0373, + "step": 570500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 570600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0378, + "step": 570700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0381, + "step": 570800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 570900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 571000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 571100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 571200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 571300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 571400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 571500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 571600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 571700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 571800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 571900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 572000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0384, + "step": 572100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 572200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 572300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0379, + "step": 572400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 572500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 572600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0393, + "step": 572700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 572800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0375, + "step": 572900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 573000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0375, + "step": 573100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0371, + "step": 573200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 573300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0375, + "step": 573400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 573500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 573600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 573700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 573800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 573900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.038, + "step": 574000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0372, + "step": 574100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0389, + "step": 574200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 574300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 574400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 574500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 574600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 574700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0381, + "step": 574800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 574900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0374, + "step": 575000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0378, + "step": 575100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 575200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 575300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 575400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 575500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0371, + "step": 575600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0387, + "step": 575700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0371, + "step": 575800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0375, + "step": 575900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0368, + "step": 576000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 576100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 576200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 576300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 576400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0369, + "step": 576500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 576600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 576700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 576800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0379, + "step": 576900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0375, + "step": 577000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 577100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0384, + "step": 577200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 577300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 577400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 577500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 577600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 577700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 577800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 577900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.039, + "step": 578000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 578100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 578200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 578300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 578400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 578500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 578600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 578700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 578800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0372, + "step": 578900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 579000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 579100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 579200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0376, + "step": 579300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0375, + "step": 579400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 579500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 579600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0369, + "step": 579700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 579800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 579900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 580000 + }, + { + "epoch": 0.0, + "eval_loss": 0.032958984375, + "eval_runtime": 3246.6947, + "eval_samples_per_second": 346.421, + "eval_steps_per_second": 21.652, + "step": 580000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0374, + "step": 580100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0379, + "step": 580200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 580300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0384, + "step": 580400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 580500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 580600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 580700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0369, + "step": 580800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 580900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 581000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 581100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0371, + "step": 581200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 581300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 581400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.039, + "step": 581500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0368, + "step": 581600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.038, + "step": 581700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 581800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0371, + "step": 581900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0371, + "step": 582000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0375, + "step": 582100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 582200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0368, + "step": 582300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 582400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 582500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 582600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 582700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 582800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 582900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 583000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 583100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 583200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 583300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 583400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 583500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0369, + "step": 583600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 583700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0368, + "step": 583800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0375, + "step": 583900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0368, + "step": 584000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0384, + "step": 584100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.038, + "step": 584200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0377, + "step": 584300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 584400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 584500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 584600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 584700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 584800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 584900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0378, + "step": 585000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 585100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 585200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 585300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 585400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 585500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0378, + "step": 585600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 585700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 585800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 585900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 586000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 586100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 586200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 586300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0372, + "step": 586400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 586500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 586600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 586700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 586800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 586900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 587000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 587100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 587200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 587300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 587400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 587500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0395, + "step": 587600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 587700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 587800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0371, + "step": 587900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0368, + "step": 588000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 588100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 588200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0375, + "step": 588300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0372, + "step": 588400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0371, + "step": 588500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0382, + "step": 588600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0383, + "step": 588700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0373, + "step": 588800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 588900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 589000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0369, + "step": 589100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 589200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0382, + "step": 589300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 589400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 589500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0368, + "step": 589600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 589700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0376, + "step": 589800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 589900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 590000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 590100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 590200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0371, + "step": 590300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 590400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 590500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 590600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 590700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 590800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 590900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 591000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 591100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 591200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 591300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 591400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 591500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 591600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 591700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 591800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 591900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 592000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0376, + "step": 592100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 592200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 592300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 592400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 592500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 592600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0382, + "step": 592700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0368, + "step": 592800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0368, + "step": 592900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 593000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 593100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 593200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0384, + "step": 593300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0383, + "step": 593400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 593500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 593600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0378, + "step": 593700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0371, + "step": 593800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 593900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 594000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0368, + "step": 594100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0372, + "step": 594200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0371, + "step": 594300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0368, + "step": 594400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 594500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 594600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 594700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 594800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 594900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 595000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 595100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0373, + "step": 595200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 595300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0375, + "step": 595400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 595500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0369, + "step": 595600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 595700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0371, + "step": 595800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0393, + "step": 595900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 596000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 596100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 596200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 596300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 596400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 596500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0368, + "step": 596600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 596700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 596800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 596900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 597000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 597100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 597200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 597300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 597400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 597500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0385, + "step": 597600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 597700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 597800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 597900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 598000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0369, + "step": 598100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 598200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 598300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 598400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0369, + "step": 598500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0379, + "step": 598600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 598700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 598800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 598900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 599000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 599100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 599200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 599300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 599400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 599500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 599600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 599700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 599800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 599900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 600000 + }, + { + "epoch": 0.0, + "eval_loss": 0.032318115234375, + "eval_runtime": 3276.9359, + "eval_samples_per_second": 343.224, + "eval_steps_per_second": 21.452, + "step": 600000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 600100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 600200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 600300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0379, + "step": 600400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 600500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0381, + "step": 600600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 600700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 600800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 600900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 601000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0368, + "step": 601100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 601200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0375, + "step": 601300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 601400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 601500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0372, + "step": 601600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 601700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 601800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 601900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 602000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 602100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 602200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 602300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0385, + "step": 602400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0385, + "step": 602500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 602600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 602700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0388, + "step": 602800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 602900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 603000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 603100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0383, + "step": 603200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 603300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 603400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 603500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0381, + "step": 603600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 603700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0374, + "step": 603800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0376, + "step": 603900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 604000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 604100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 604200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 604300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0372, + "step": 604400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 604500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 604600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 604700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 604800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 604900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 605000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0377, + "step": 605100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0373, + "step": 605200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 605300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0369, + "step": 605400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 605500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 605600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 605700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0371, + "step": 605800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 605900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0373, + "step": 606000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 606100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 606200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0376, + "step": 606300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 606400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0391, + "step": 606500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0368, + "step": 606600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 606700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 606800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0373, + "step": 606900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0378, + "step": 607000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 607100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 607200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 607300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0385, + "step": 607400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 607500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 607600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 607700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 607800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 607900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 608000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0368, + "step": 608100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 608200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 608300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0374, + "step": 608400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 608500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 608600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 608700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 608800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 608900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 609000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 609100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 609200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 609300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 609400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 609500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0386, + "step": 609600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0376, + "step": 609700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 609800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 609900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 610000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 610100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 610200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 610300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0369, + "step": 610400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 610500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0375, + "step": 610600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 610700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0377, + "step": 610800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0372, + "step": 610900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0381, + "step": 611000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 611100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 611200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 611300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 611400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 611500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 611600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 611700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 611800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0369, + "step": 611900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 612000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 612100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 612200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 612300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 612400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 612500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 612600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0368, + "step": 612700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 612800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0375, + "step": 612900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 613000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 613100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0376, + "step": 613200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 613300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 613400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 613500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 613600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 613700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 613800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 613900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 614000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 614100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 614200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 614300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0372, + "step": 614400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0386, + "step": 614500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 614600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 614700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 614800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 614900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.038, + "step": 615000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0377, + "step": 615100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 615200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 615300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 615400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 615500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 615600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 615700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 615800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 615900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 616000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 616100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 616200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 616300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0369, + "step": 616400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 616500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0389, + "step": 616600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 616700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 616800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 616900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 617000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 617100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 617200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 617300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 617400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 617500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0368, + "step": 617600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 617700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 617800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 617900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 618000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 618100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 618200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0381, + "step": 618300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0396, + "step": 618400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 618500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 618600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 618700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0371, + "step": 618800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 618900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 619000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 619100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 619200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 619300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 619400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 619500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 619600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0376, + "step": 619700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 619800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0375, + "step": 619900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 620000 + }, + { + "epoch": 0.0, + "eval_loss": 0.032318115234375, + "eval_runtime": 3169.2624, + "eval_samples_per_second": 354.885, + "eval_steps_per_second": 22.181, + "step": 620000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0377, + "step": 620100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 620200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0371, + "step": 620300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 620400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 620500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0371, + "step": 620600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0373, + "step": 620700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 620800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 620900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 621000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0376, + "step": 621100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0368, + "step": 621200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 621300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0392, + "step": 621400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0369, + "step": 621500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 621600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 621700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 621800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0368, + "step": 621900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 622000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 622100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 622200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 622300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0368, + "step": 622400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0377, + "step": 622500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 622600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 622700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 622800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0374, + "step": 622900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 623000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 623100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0371, + "step": 623200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 623300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0368, + "step": 623400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 623500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 623600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 623700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 623800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 623900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 624000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 624100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 624200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 624300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 624400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 624500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 624600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 624700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 624800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 624900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0374, + "step": 625000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 625100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0375, + "step": 625200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 625300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0378, + "step": 625400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 625500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 625600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0368, + "step": 625700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 625800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0373, + "step": 625900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0375, + "step": 626000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 626100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 626200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 626300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 626400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 626500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 626600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 626700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 626800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 626900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 627000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 627100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 627200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 627300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 627400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 627500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 627600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 627700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 627800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 627900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0368, + "step": 628000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 628100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 628200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0374, + "step": 628300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 628400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 628500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 628600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0383, + "step": 628700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0375, + "step": 628800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0368, + "step": 628900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 629000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 629100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 629200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0369, + "step": 629300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 629400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 629500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 629600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0371, + "step": 629700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 629800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 629900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 630000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0373, + "step": 630100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 630200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 630300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 630400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0368, + "step": 630500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 630600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 630700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 630800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 630900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 631000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 631100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 631200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 631300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 631400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 631500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 631600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 631700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 631800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 631900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 632000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0387, + "step": 632100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 632200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 632300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 632400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 632500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0368, + "step": 632600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 632700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 632800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 632900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 633000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 633100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 633200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 633300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 633400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 633500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 633600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 633700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 633800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 633900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 634000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 634100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 634200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 634300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 634400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 634500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 634600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 634700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 634800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 634900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0369, + "step": 635000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 635100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 635200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 635300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 635400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 635500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 635600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 635700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0387, + "step": 635800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0373, + "step": 635900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 636000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 636100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 636200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 636300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 636400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0368, + "step": 636500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0371, + "step": 636600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 636700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 636800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 636900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 637000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 637100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 637200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 637300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 637400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 637500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 637600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 637700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 637800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 637900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 638000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 638100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0373, + "step": 638200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 638300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 638400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0409, + "step": 638500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 638600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 638700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 638800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 638900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 639000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 639100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 639200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0373, + "step": 639300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 639400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 639500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 639600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0378, + "step": 639700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 639800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 639900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 640000 + }, + { + "epoch": 0.0, + "eval_loss": 0.03216552734375, + "eval_runtime": 3040.9401, + "eval_samples_per_second": 369.86, + "eval_steps_per_second": 23.117, + "step": 640000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 640100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 640200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 640300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 640400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 640500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 640600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0383, + "step": 640700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 640800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 640900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 641000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 641100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 641200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 641300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 641400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 641500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 641600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 641700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 641800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 641900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 642000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0372, + "step": 642100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 642200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 642300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0375, + "step": 642400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0373, + "step": 642500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 642600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 642700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 642800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 642900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 643000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 643100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 643200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 643300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 643400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 643500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 643600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 643700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0371, + "step": 643800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 643900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0385, + "step": 644000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 644100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 644200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0384, + "step": 644300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0369, + "step": 644400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 644500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 644600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 644700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.038, + "step": 644800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 644900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0383, + "step": 645000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 645100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0371, + "step": 645200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 645300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 645400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 645500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 645600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 645700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 645800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 645900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 646000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0368, + "step": 646100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 646200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 646300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 646400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 646500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 646600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 646700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 646800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 646900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 647000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 647100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0375, + "step": 647200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 647300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0375, + "step": 647400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0368, + "step": 647500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 647600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 647700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 647800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 647900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0368, + "step": 648000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 648100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 648200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 648300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 648400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 648500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0369, + "step": 648600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 648700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 648800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 648900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 649000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 649100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 649200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 649300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 649400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 649500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 649600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 649700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 649800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 649900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 650000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 650100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 650200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 650300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 650400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0368, + "step": 650500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 650600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 650700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 650800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0374, + "step": 650900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 651000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 651100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0374, + "step": 651200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 651300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 651400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 651500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.038, + "step": 651600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 651700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 651800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 651900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0383, + "step": 652000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 652100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 652200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 652300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 652400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 652500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 652600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 652700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 652800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 652900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0378, + "step": 653000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 653100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0386, + "step": 653200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 653300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 653400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 653500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 653600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 653700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 653800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 653900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 654000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 654100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 654200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 654300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 654400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 654500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 654600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 654700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 654800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 654900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 655000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 655100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 655200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 655300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 655400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 655500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 655600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 655700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 655800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0389, + "step": 655900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 656000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 656100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 656200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0377, + "step": 656300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 656400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 656500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 656600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 656700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 656800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 656900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 657000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 657100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 657200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 657300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 657400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 657500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 657600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 657700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 657800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0373, + "step": 657900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 658000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0372, + "step": 658100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 658200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 658300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 658400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 658500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 658600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 658700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0369, + "step": 658800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 658900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 659000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 659100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 659200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 659300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 659400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0375, + "step": 659500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 659600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 659700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 659800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 659900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0376, + "step": 660000 + }, + { + "epoch": 0.0, + "eval_loss": 0.03192138671875, + "eval_runtime": 3088.6861, + "eval_samples_per_second": 364.143, + "eval_steps_per_second": 22.759, + "step": 660000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 660100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 660200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 660300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 660400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 660500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 660600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 660700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 660800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 660900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 661000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0376, + "step": 661100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 661200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 661300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 661400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 661500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 661600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 661700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 661800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 661900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 662000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0382, + "step": 662100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 662200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0368, + "step": 662300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 662400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 662500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 662600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 662700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 662800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0371, + "step": 662900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 663000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 663100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 663200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 663300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0369, + "step": 663400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 663500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 663600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 663700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0375, + "step": 663800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 663900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 664000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 664100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 664200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 664300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 664400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 664500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 664600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 664700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 664800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 664900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 665000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 665100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 665200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 665300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.038, + "step": 665400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 665500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 665600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 665700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 665800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 665900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 666000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0377, + "step": 666100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 666200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 666300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 666400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 666500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0383, + "step": 666600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0372, + "step": 666700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 666800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 666900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 667000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 667100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 667200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 667300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 667400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 667500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 667600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 667700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 667800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 667900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 668000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 668100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 668200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 668300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 668400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 668500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 668600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 668700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 668800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 668900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 669000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 669100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 669200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 669300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 669400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 669500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 669600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 669700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 669800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 669900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 670000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 670100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 670200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 670300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 670400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 670500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 670600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 670700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 670800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 670900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 671000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 671100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0368, + "step": 671200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 671300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0368, + "step": 671400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 671500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 671600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 671700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 671800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 671900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 672000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 672100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 672200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 672300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 672400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 672500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 672600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 672700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 672800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 672900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 673000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 673100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 673200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 673300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 673400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 673500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 673600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 673700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 673800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 673900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 674000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 674100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 674200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 674300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 674400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 674500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 674600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 674700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 674800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 674900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 675000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 675100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 675200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 675300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 675400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 675500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 675600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0373, + "step": 675700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 675800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 675900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 676000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 676100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 676200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0385, + "step": 676300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 676400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 676500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0379, + "step": 676600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 676700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 676800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 676900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 677000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 677100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 677200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 677300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0373, + "step": 677400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 677500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 677600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 677700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0372, + "step": 677800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 677900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0377, + "step": 678000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0393, + "step": 678100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 678200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 678300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 678400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 678500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 678600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 678700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 678800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 678900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 679000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 679100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 679200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 679300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 679400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 679500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 679600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 679700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 679800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 679900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 680000 + }, + { + "epoch": 0.0, + "eval_loss": 0.03179931640625, + "eval_runtime": 3153.5498, + "eval_samples_per_second": 356.653, + "eval_steps_per_second": 22.291, + "step": 680000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 680100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 680200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 680300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 680400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 680500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0378, + "step": 680600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 680700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0368, + "step": 680800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 680900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 681000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 681100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 681200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 681300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 681400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 681500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 681600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 681700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 681800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 681900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 682000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 682100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 682200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 682300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0373, + "step": 682400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 682500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 682600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 682700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0374, + "step": 682800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 682900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 683000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 683100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 683200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0379, + "step": 683300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 683400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 683500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 683600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 683700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 683800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0369, + "step": 683900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 684000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 684100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 684200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 684300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 684400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 684500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 684600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 684700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0368, + "step": 684800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 684900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 685000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 685100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0368, + "step": 685200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 685300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 685400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 685500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 685600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 685700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 685800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 685900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 686000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 686100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 686200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 686300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 686400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 686500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0368, + "step": 686600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 686700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 686800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 686900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 687000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 687100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 687200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 687300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 687400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 687500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 687600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 687700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 687800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 687900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 688000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 688100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 688200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 688300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 688400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 688500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 688600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 688700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 688800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 688900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 689000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 689100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 689200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 689300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 689400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 689500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 689600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 689700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 689800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 689900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0374, + "step": 690000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 690100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 690200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 690300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 690400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 690500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 690600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 690700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 690800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0371, + "step": 690900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 691000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0372, + "step": 691100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 691200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 691300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 691400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 691500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 691600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 691700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 691800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 691900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 692000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0368, + "step": 692100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 692200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 692300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 692400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 692500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 692600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 692700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 692800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 692900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 693000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 693100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 693200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 693300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 693400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0369, + "step": 693500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 693600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 693700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 693800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0387, + "step": 693900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 694000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 694100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 694200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 694300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 694400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 694500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 694600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 694700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 694800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 694900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 695000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 695100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0373, + "step": 695200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 695300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0372, + "step": 695400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 695500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 695600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 695700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 695800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 695900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 696000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 696100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 696200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 696300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 696400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 696500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 696600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 696700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0372, + "step": 696800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 696900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 697000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 697100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 697200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 697300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 697400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 697500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 697600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 697700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 697800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 697900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 698000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 698100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 698200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0371, + "step": 698300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 698400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 698500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0369, + "step": 698600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 698700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 698800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 698900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 699000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 699100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 699200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 699300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 699400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 699500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 699600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 699700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 699800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 699900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 700000 + }, + { + "epoch": 0.0, + "eval_loss": 0.032257080078125, + "eval_runtime": 3135.6458, + "eval_samples_per_second": 358.689, + "eval_steps_per_second": 22.418, + "step": 700000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 700100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 700200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 700300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 700400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 700500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 700600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 700700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 700800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 700900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 701000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0373, + "step": 701100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 701200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 701300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 701400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 701500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 701600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 701700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 701800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 701900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 702000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 702100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 702200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0371, + "step": 702300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 702400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 702500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 702600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 702700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 702800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 702900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 703000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 703100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 703200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 703300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 703400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 703500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 703600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 703700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 703800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 703900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 704000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 704100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 704200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 704300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 704400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 704500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 704600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 704700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 704800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 704900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 705000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 705100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 705200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0368, + "step": 705300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 705400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 705500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 705600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 705700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 705800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 705900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 706000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 706100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 706200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 706300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 706400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 706500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 706600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 706700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 706800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 706900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 707000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 707100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 707200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 707300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 707400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 707500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 707600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0368, + "step": 707700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 707800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 707900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0369, + "step": 708000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 708100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 708200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 708300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 708400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 708500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 708600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 708700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 708800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 708900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 709000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 709100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 709200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 709300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 709400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 709500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0372, + "step": 709600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 709700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 709800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 709900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 710000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 710100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 710200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 710300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 710400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 710500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 710600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 710700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0368, + "step": 710800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 710900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 711000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 711100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 711200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 711300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 711400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 711500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 711600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 711700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 711800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 711900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 712000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 712100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 712200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0368, + "step": 712300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 712400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 712500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 712600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 712700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 712800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 712900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 713000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 713100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 713200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 713300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 713400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 713500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 713600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 713700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 713800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 713900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0375, + "step": 714000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 714100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 714200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 714300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 714400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 714500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 714600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 714700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 714800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 714900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 715000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 715100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 715200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 715300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 715400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 715500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 715600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 715700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 715800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 715900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 716000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 716100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 716200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 716300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 716400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 716500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 716600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0371, + "step": 716700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 716800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 716900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 717000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 717100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 717200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0369, + "step": 717300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 717400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 717500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 717600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.038, + "step": 717700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 717800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 717900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 718000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 718100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 718200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 718300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 718400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0372, + "step": 718500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 718600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 718700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 718800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 718900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 719000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 719100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 719200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 719300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 719400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 719500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 719600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 719700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 719800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 719900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 720000 + }, + { + "epoch": 0.0, + "eval_loss": 0.031890869140625, + "eval_runtime": 3100.2132, + "eval_samples_per_second": 362.789, + "eval_steps_per_second": 22.675, + "step": 720000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 720100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 720200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 720300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0377, + "step": 720400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 720500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 720600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 720700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 720800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 720900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0377, + "step": 721000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 721100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 721200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 721300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 721400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 721500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 721600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 721700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 721800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 721900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 722000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 722100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 722200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 722300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 722400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 722500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 722600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 722700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 722800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 722900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 723000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 723100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 723200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 723300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 723400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 723500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 723600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 723700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 723800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 723900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 724000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 724100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 724200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 724300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 724400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 724500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 724600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 724700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 724800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 724900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 725000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0374, + "step": 725100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 725200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 725300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0373, + "step": 725400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 725500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 725600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 725700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 725800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 725900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 726000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 726100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 726200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 726300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 726400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 726500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 726600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 726700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 726800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 726900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 727000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 727100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0368, + "step": 727200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 727300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 727400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 727500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 727600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 727700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 727800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 727900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 728000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 728100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 728200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 728300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 728400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 728500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 728600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 728700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 728800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 728900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 729000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 729100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 729200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 729300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 729400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 729500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 729600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 729700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 729800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 729900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 730000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 730100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 730200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 730300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 730400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 730500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 730600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 730700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 730800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 730900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 731000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 731100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 731200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 731300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 731400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 731500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 731600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 731700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 731800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0378, + "step": 731900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 732000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 732100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0371, + "step": 732200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 732300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 732400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 732500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 732600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 732700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 732800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 732900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0388, + "step": 733000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 733100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 733200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 733300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 733400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 733500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 733600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 733700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 733800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 733900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 734000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 734100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 734200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 734300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 734400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 734500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 734600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 734700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0368, + "step": 734800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 734900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 735000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 735100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 735200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 735300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0371, + "step": 735400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 735500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 735600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 735700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 735800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 735900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 736000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 736100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 736200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 736300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 736400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 736500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 736600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 736700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 736800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 736900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 737000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 737100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 737200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 737300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.039, + "step": 737400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 737500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 737600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 737700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 737800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 737900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 738000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 738100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 738200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 738300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0372, + "step": 738400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 738500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 738600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 738700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 738800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 738900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 739000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 739100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 739200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 739300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 739400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.038, + "step": 739500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 739600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 739700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 739800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 739900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 740000 + }, + { + "epoch": 0.0, + "eval_loss": 0.031463623046875, + "eval_runtime": 3193.1308, + "eval_samples_per_second": 352.232, + "eval_steps_per_second": 22.015, + "step": 740000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 740100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 740200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 740300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 740400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 740500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 740600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 740700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 740800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 740900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 741000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 741100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 741200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 741300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 741400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 741500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0405, + "step": 741600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 741700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 741800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 741900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0368, + "step": 742000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 742100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 742200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 742300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 742400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 742500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 742600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 742700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 742800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 742900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 743000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 743100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 743200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 743300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 743400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 743500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 743600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 743700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 743800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 743900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 744000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 744100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 744200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 744300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 744400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 744500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 744600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 744700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 744800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 744900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 745000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 745100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 745200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 745300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 745400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 745500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 745600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 745700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 745800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 745900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 746000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0376, + "step": 746100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0368, + "step": 746200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 746300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 746400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 746500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 746600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 746700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 746800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 746900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 747000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 747100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 747200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 747300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 747400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 747500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0374, + "step": 747600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 747700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0373, + "step": 747800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 747900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 748000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 748100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 748200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 748300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 748400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 748500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 748600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 748700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 748800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 748900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0373, + "step": 749000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 749100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 749200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 749300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 749400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 749500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 749600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 749700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 749800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 749900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 750000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 750100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 750200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 750300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 750400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 750500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 750600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 750700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 750800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 750900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 751000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 751100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 751200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 751300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 751400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 751500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 751600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 751700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 751800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 751900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 752000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 752100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 752200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 752300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 752400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 752500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 752600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 752700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 752800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 752900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 753000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 753100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 753200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 753300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 753400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 753500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 753600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 753700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 753800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0368, + "step": 753900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 754000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 754100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 754200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 754300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 754400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 754500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 754600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 754700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 754800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 754900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 755000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 755100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 755200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 755300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 755400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 755500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 755600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 755700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 755800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 755900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 756000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 756100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 756200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 756300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 756400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0375, + "step": 756500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 756600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 756700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 756800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 756900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 757000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 757100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 757200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 757300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 757400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 757500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 757600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 757700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 757800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 757900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 758000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 758100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 758200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 758300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 758400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 758500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 758600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 758700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 758800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 758900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 759000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 759100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 759200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 759300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 759400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 759500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 759600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 759700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 759800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 759900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 760000 + }, + { + "epoch": 0.0, + "eval_loss": 0.03131103515625, + "eval_runtime": 3049.8434, + "eval_samples_per_second": 368.781, + "eval_steps_per_second": 23.049, + "step": 760000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0373, + "step": 760100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 760200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 760300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 760400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 760500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0381, + "step": 760600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 760700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 760800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0377, + "step": 760900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 761000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 761100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 761200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 761300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 761400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 761500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 761600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 761700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 761800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 761900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 762000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 762100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 762200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 762300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 762400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 762500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 762600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 762700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 762800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 762900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 763000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 763100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 763200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 763300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 763400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 763500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 763600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 763700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 763800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 763900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 764000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 764100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 764200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 764300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 764400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 764500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 764600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 764700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 764800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 764900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 765000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 765100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 765200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 765300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 765400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 765500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0368, + "step": 765600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 765700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 765800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 765900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 766000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0371, + "step": 766100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 766200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 766300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 766400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 766500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 766600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 766700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 766800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 766900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 767000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 767100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 767200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 767300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 767400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 767500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 767600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 767700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 767800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 767900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 768000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 768100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 768200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 768300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 768400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 768500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 768600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 768700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 768800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 768900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 769000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 769100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 769200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 769300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 769400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 769500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 769600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 769700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 769800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 769900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 770000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 770100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 770200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0376, + "step": 770300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 770400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 770500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 770600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 770700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 770800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 770900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 771000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 771100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 771200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 771300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 771400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 771500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 771600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 771700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 771800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0382, + "step": 771900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 772000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 772100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 772200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 772300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 772400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 772500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 772600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 772700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 772800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 772900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 773000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 773100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 773200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 773300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 773400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 773500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 773600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 773700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 773800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 773900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 774000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0368, + "step": 774100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 774200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 774300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 774400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 774500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 774600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 774700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 774800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 774900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 775000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 775100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 775200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 775300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 775400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 775500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 775600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0369, + "step": 775700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 775800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 775900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 776000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 776100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 776200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 776300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 776400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 776500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 776600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 776700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 776800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 776900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 777000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 777100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 777200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 777300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 777400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 777500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 777600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 777700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 777800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 777900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 778000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0373, + "step": 778100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 778200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 778300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0378, + "step": 778400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 778500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 778600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 778700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 778800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 778900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 779000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 779100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 779200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 779300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 779400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 779500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 779600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 779700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 779800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 779900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 780000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0312347412109375, + "eval_runtime": 3485.0614, + "eval_samples_per_second": 322.727, + "eval_steps_per_second": 20.171, + "step": 780000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 780100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 780200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 780300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 780400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 780500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 780600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 780700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 780800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 780900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 781000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 781100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 781200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 781300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 781400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0373, + "step": 781500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 781600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 781700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 781800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 781900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 782000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 782100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 782200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 782300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 782400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 782500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 782600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 782700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 782800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 782900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 783000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 783100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 783200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 783300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 783400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 783500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 783600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 783700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 783800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 783900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 784000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 784100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0368, + "step": 784200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 784300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 784400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 784500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 784600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 784700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 784800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 784900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 785000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 785100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 785200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 785300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 785400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 785500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 785600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 785700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 785800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 785900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 786000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 786100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 786200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 786300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 786400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 786500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 786600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 786700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 786800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 786900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 787000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 787100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 787200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 787300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 787400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 787500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 787600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 787700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 787800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 787900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 788000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 788100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 788200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0377, + "step": 788300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 788400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 788500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 788600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 788700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 788800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 788900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 789000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 789100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 789200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0374, + "step": 789300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 789400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 789500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 789600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 789700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 789800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 789900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 790000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 790100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 790200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 790300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 790400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 790500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 790600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 790700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 790800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 790900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 791000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 791100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 791200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 791300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 791400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 791500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 791600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 791700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 791800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 791900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 792000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 792100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 792200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 792300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 792400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 792500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 792600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 792700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 792800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 792900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 793000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 793100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 793200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 793300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 793400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 793500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 793600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 793700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 793800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 793900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 794000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 794100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 794200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 794300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 794400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 794500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 794600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 794700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 794800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 794900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 795000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 795100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 795200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 795300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 795400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 795500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 795600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 795700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 795800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 795900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 796000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 796100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 796200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0373, + "step": 796300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 796400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 796500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 796600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 796700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 796800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 796900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 797000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 797100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 797200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 797300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 797400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 797500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 797600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 797700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 797800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 797900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 798000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 798100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 798200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 798300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 798400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 798500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 798600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 798700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 798800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 798900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 799000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 799100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 799200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 799300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 799400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 799500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 799600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 799700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 799800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 799900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 800000 + }, + { + "epoch": 0.0, + "eval_loss": 0.031463623046875, + "eval_runtime": 3435.6458, + "eval_samples_per_second": 327.369, + "eval_steps_per_second": 20.461, + "step": 800000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 800100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0383, + "step": 800200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 800300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 800400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 800500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 800600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 800700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 800800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 800900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 801000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 801100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 801200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 801300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 801400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 801500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 801600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 801700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 801800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 801900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 802000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 802100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 802200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 802300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 802400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 802500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 802600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 802700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 802800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 802900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 803000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 803100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 803200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 803300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 803400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 803500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 803600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 803700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 803800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 803900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 804000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 804100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 804200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 804300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 804400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 804500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 804600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 804700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 804800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 804900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 805000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 805100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 805200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 805300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 805400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 805500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 805600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 805700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 805800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 805900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 806000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 806100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 806200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 806300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 806400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 806500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 806600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 806700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 806800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 806900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 807000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 807100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 807200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 807300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 807400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 807500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 807600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 807700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 807800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 807900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 808000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 808100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 808200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 808300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 808400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 808500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 808600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 808700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 808800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 808900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 809000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 809100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 809200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 809300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 809400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 809500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 809600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 809700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 809800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 809900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 810000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 810100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 810200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 810300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 810400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 810500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 810600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 810700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 810800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 810900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 811000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 811100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 811200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 811300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 811400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 811500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 811600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 811700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 811800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 811900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 812000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 812100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 812200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 812300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 812400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 812500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 812600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 812700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 812800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 812900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 813000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 813100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 813200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 813300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 813400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 813500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 813600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 813700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 813800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 813900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 814000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 814100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 814200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 814300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 814400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 814500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 814600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 814700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 814800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 814900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 815000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 815100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 815200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 815300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 815400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 815500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 815600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 815700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 815800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 815900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 816000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 816100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 816200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 816300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 816400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 816500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 816600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 816700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 816800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 816900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 817000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 817100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 817200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 817300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 817400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 817500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 817600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 817700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 817800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 817900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 818000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 818100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 818200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 818300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 818400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 818500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 818600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 818700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 818800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 818900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 819000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 819100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 819200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 819300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 819400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 819500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 819600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 819700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 819800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 819900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 820000 + }, + { + "epoch": 0.0, + "eval_loss": 0.031341552734375, + "eval_runtime": 3287.9169, + "eval_samples_per_second": 342.078, + "eval_steps_per_second": 21.38, + "step": 820000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 820100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 820200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 820300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 820400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 820500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 820600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 820700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 820800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 820900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 821000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 821100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 821200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 821300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 821400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 821500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 821600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 821700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 821800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 821900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 822000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 822100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 822200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 822300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 822400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 822500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 822600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 822700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 822800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 822900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 823000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 823100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 823200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 823300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 823400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 823500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 823600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 823700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 823800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 823900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 824000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 824100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 824200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 824300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 824400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 824500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 824600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 824700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 824800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 824900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 825000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 825100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 825200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 825300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 825400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 825500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 825600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 825700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 825800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 825900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 826000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 826100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 826200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 826300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 826400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 826500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 826600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 826700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 826800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 826900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 827000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 827100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 827200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 827300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 827400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 827500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 827600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 827700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 827800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 827900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 828000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 828100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 828200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 828300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 828400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 828500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 828600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 828700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 828800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 828900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 829000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 829100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 829200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 829300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 829400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 829500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 829600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 829700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 829800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 829900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 830000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 830100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0384, + "step": 830200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 830300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 830400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 830500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 830600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0372, + "step": 830700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 830800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 830900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 831000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 831100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 831200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 831300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 831400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 831500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 831600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 831700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 831800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 831900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 832000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 832100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 832200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 832300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 832400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 832500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 832600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 832700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 832800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 832900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 833000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0379, + "step": 833100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 833200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 833300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 833400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 833500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 833600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 833700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 833800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 833900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 834000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 834100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 834200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 834300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 834400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 834500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 834600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 834700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 834800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 834900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 835000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 835100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 835200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 835300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 835400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 835500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 835600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 835700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 835800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 835900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 836000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 836100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 836200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 836300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 836400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 836500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 836600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 836700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 836800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 836900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 837000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 837100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 837200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 837300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 837400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 837500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 837600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 837700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 837800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0372, + "step": 837900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 838000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 838100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 838200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 838300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 838400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 838500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 838600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 838700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 838800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 838900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 839000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 839100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 839200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 839300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 839400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 839500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 839600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 839700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 839800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 839900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 840000 + }, + { + "epoch": 0.0, + "eval_loss": 0.031341552734375, + "eval_runtime": 3015.0718, + "eval_samples_per_second": 373.034, + "eval_steps_per_second": 23.315, + "step": 840000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 840100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 840200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 840300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 840400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 840500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 840600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 840700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 840800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 840900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 841000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 841100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 841200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 841300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 841400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 841500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 841600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 841700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 841800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 841900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 842000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 842100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 842200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 842300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 842400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 842500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 842600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 842700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 842800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 842900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 843000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 843100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 843200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 843300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 843400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 843500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 843600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 843700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 843800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 843900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 844000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 844100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 844200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 844300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 844400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 844500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 844600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 844700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 844800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 844900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 845000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 845100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 845200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 845300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 845400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 845500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 845600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 845700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 845800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 845900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 846000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 846100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 846200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 846300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 846400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 846500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 846600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 846700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 846800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 846900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 847000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 847100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 847200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 847300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 847400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 847500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 847600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 847700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 847800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 847900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 848000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 848100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 848200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 848300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0385, + "step": 848400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 848500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 848600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 848700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 848800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 848900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 849000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 849100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 849200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 849300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 849400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 849500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 849600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 849700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 849800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 849900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 850000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 850100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 850200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 850300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 850400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 850500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 850600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 850700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 850800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 850900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 851000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 851100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 851200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 851300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 851400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 851500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 851600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 851700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 851800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 851900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 852000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 852100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 852200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 852300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 852400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 852500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 852600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0388, + "step": 852700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 852800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 852900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 853000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 853100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 853200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 853300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 853400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 853500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 853600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 853700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 853800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 853900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 854000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 854100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 854200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 854300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 854400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 854500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 854600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 854700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 854800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 854900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 855000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 855100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 855200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 855300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 855400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0384, + "step": 855500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 855600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 855700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 855800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 855900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 856000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 856100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 856200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0385, + "step": 856300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 856400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 856500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 856600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 856700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 856800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 856900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 857000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 857100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 857200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 857300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 857400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 857500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 857600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 857700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 857800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 857900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 858000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 858100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 858200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 858300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 858400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 858500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 858600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 858700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 858800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 858900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 859000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 859100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 859200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 859300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 859400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 859500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 859600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 859700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 859800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 859900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 860000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0307769775390625, + "eval_runtime": 3051.3726, + "eval_samples_per_second": 368.596, + "eval_steps_per_second": 23.038, + "step": 860000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 860100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 860200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 860300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 860400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 860500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 860600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 860700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 860800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 860900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 861000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 861100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 861200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 861300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 861400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 861500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 861600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 861700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 861800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 861900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 862000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 862100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 862200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 862300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 862400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 862500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 862600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 862700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 862800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 862900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 863000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 863100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 863200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 863300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 863400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 863500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 863600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 863700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 863800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 863900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 864000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 864100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 864200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 864300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 864400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 864500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 864600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 864700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 864800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 864900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 865000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 865100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 865200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 865300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 865400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 865500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 865600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 865700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0369, + "step": 865800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 865900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 866000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 866100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 866200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 866300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 866400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 866500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 866600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 866700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 866800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 866900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 867000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 867100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 867200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 867300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 867400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 867500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 867600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 867700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 867800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 867900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 868000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 868100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 868200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 868300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 868400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 868500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 868600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 868700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 868800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 868900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 869000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 869100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 869200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 869300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 869400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 869500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 869600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 869700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 869800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 869900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 870000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 870100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 870200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 870300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 870400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 870500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 870600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 870700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 870800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 870900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 871000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 871100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 871200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 871300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 871400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 871500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 871600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 871700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 871800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 871900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 872000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 872100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 872200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 872300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 872400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 872500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 872600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 872700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 872800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 872900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 873000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 873100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 873200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 873300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 873400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 873500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 873600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 873700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 873800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 873900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 874000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 874100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 874200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 874300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 874400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 874500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 874600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 874700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 874800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 874900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 875000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 875100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 875200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 875300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 875400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 875500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 875600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 875700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0376, + "step": 875800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 875900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 876000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 876100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 876200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 876300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 876400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 876500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 876600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 876700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 876800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 876900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 877000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 877100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 877200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 877300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 877400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0373, + "step": 877500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 877600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 877700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 877800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 877900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 878000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 878100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 878200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 878300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 878400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 878500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 878600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 878700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 878800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 878900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 879000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 879100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 879200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 879300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 879400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 879500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 879600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 879700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 879800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 879900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 880000 + }, + { + "epoch": 0.0, + "eval_loss": 0.03094482421875, + "eval_runtime": 3080.9329, + "eval_samples_per_second": 365.059, + "eval_steps_per_second": 22.816, + "step": 880000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 880100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 880200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 880300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 880400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 880500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 880600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 880700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 880800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 880900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 881000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 881100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 881200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 881300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 881400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 881500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 881600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 881700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 881800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 881900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 882000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 882100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 882200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 882300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 882400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 882500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 882600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 882700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 882800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 882900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 883000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 883100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 883200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 883300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 883400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 883500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 883600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 883700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 883800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 883900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 884000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 884100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 884200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 884300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 884400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 884500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 884600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 884700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 884800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 884900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 885000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 885100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 885200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 885300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 885400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 885500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 885600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 885700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 885800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 885900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 886000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 886100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 886200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 886300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 886400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 886500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 886600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 886700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 886800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 886900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 887000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 887100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 887200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 887300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 887400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 887500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 887600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 887700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 887800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 887900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 888000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 888100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 888200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 888300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 888400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 888500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 888600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 888700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 888800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 888900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 889000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 889100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 889200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 889300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 889400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 889500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 889600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 889700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 889800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 889900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 890000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 890100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 890200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 890300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 890400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 890500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 890600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 890700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 890800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 890900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 891000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 891100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 891200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 891300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 891400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 891500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 891600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 891700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 891800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 891900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 892000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 892100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 892200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0385, + "step": 892300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 892400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 892500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 892600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 892700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 892800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 892900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 893000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 893100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 893200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 893300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 893400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 893500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 893600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 893700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 893800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 893900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 894000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 894100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 894200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 894300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 894400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 894500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 894600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 894700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 894800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 894900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 895000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 895100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 895200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 895300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 895400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 895500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 895600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 895700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 895800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 895900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 896000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 896100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 896200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 896300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 896400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 896500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 896600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 896700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 896800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 896900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 897000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 897100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 897200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 897300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 897400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 897500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 897600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 897700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 897800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 897900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 898000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 898100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 898200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 898300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 898400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 898500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 898600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 898700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 898800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 898900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 899000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 899100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 899200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 899300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 899400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 899500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 899600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 899700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 899800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 899900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 900000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0309295654296875, + "eval_runtime": 3200.3969, + "eval_samples_per_second": 351.432, + "eval_steps_per_second": 21.965, + "step": 900000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 900100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 900200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 900300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 900400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 900500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 900600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 900700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 900800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 900900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 901000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 901100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 901200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 901300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 901400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 901500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 901600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 901700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 901800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 901900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 902000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 902100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 902200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 902300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 902400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 902500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 902600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 902700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 902800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 902900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 903000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 903100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 903200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 903300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 903400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 903500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 903600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 903700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 903800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 903900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 904000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 904100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 904200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 904300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 904400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 904500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 904600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 904700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 904800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 904900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 905000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 905100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 905200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 905300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 905400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 905500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 905600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 905700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 905800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 905900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 906000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 906100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 906200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 906300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 906400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 906500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 906600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 906700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 906800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 906900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 907000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0385, + "step": 907100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 907200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 907300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 907400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 907500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 907600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 907700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 907800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 907900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 908000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 908100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 908200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0377, + "step": 908300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 908400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 908500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 908600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 908700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 908800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 908900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 909000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 909100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 909200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 909300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 909400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 909500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 909600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 909700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 909800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 909900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 910000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 910100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 910200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 910300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 910400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 910500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 910600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 910700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 910800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 910900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 911000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 911100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 911200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 911300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 911400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 911500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 911600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 911700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 911800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 911900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 912000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 912100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 912200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 912300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 912400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 912500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 912600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 912700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 912800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 912900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 913000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 913100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 913200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 913300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 913400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 913500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 913600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 913700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 913800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 913900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 914000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 914100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 914200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 914300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0375, + "step": 914400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 914500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 914600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 914700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 914800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 914900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 915000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 915100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 915200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 915300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 915400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 915500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 915600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 915700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 915800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 915900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 916000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 916100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 916200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 916300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 916400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 916500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 916600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 916700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 916800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 916900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 917000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 917100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 917200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 917300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 917400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 917500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 917600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 917700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 917800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 917900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 918000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 918100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 918200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 918300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 918400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 918500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 918600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 918700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 918800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 918900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 919000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 919100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 919200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 919300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 919400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 919500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 919600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 919700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 919800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 919900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 920000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0310516357421875, + "eval_runtime": 3211.005, + "eval_samples_per_second": 350.271, + "eval_steps_per_second": 21.892, + "step": 920000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 920100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 920200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 920300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 920400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 920500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 920600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 920700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 920800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 920900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 921000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 921100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 921200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 921300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 921400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 921500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 921600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 921700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 921800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 921900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 922000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 922100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 922200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 922300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 922400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 922500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 922600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 922700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 922800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 922900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 923000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 923100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 923200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 923300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 923400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 923500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 923600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 923700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 923800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 923900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 924000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 924100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 924200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 924300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 924400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 924500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 924600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 924700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 924800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 924900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 925000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 925100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0368, + "step": 925200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 925300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 925400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 925500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 925600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 925700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 925800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 925900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 926000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 926100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 926200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 926300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 926400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 926500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 926600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 926700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 926800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 926900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 927000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 927100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 927200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 927300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 927400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 927500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 927600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 927700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 927800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 927900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 928000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 928100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 928200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 928300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 928400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 928500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 928600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 928700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 928800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 928900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 929000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 929100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 929200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 929300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 929400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 929500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 929600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 929700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 929800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 929900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 930000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 930100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 930200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 930300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 930400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 930500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 930600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 930700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 930800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 930900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 931000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 931100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 931200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 931300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 931400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 931500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 931600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 931700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 931800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 931900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 932000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 932100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 932200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 932300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 932400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 932500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 932600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 932700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 932800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 932900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 933000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 933100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 933200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 933300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 933400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0371, + "step": 933500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 933600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 933700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 933800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 933900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 934000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 934100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 934200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 934300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 934400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 934500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 934600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 934700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 934800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 934900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 935000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 935100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 935200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 935300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 935400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 935500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 935600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 935700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 935800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 935900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 936000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 936100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 936200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 936300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 936400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 936500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 936600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 936700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 936800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 936900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 937000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 937100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 937200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 937300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 937400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 937500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 937600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 937700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 937800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 937900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 938000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 938100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 938200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 938300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 938400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 938500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 938600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 938700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 938800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 938900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 939000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 939100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 939200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 939300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 939400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 939500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 939600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 939700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 939800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 939900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 940000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0307159423828125, + "eval_runtime": 3409.0727, + "eval_samples_per_second": 329.921, + "eval_steps_per_second": 20.62, + "step": 940000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 940100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 940200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 940300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 940400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 940500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 940600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 940700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 940800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 940900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 941000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 941100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 941200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 941300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 941400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 941500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 941600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 941700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 941800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 941900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 942000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 942100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 942200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 942300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 942400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 942500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 942600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 942700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 942800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 942900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 943000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 943100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 943200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 943300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 943400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 943500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 943600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 943700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 943800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 943900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 944000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 944100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 944200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 944300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 944400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 944500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 944600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 944700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 944800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 944900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 945000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0369, + "step": 945100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 945200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 945300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 945400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 945500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 945600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 945700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 945800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 945900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 946000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 946100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 946200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 946300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 946400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 946500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 946600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 946700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 946800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 946900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 947000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 947100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 947200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 947300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 947400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 947500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 947600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 947700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 947800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 947900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 948000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0372, + "step": 948100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 948200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 948300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 948400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 948500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 948600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 948700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 948800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 948900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 949000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 949100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 949200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 949300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 949400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 949500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 949600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 949700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 949800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 949900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 950000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 950100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 950200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 950300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 950400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 950500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 950600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 950700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 950800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 950900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 951000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 951100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 951200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 951300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 951400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 951500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 951600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 951700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 951800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 951900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 952000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 952100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 952200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 952300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 952400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 952500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 952600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 952700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 952800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 952900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 953000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 953100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 953200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 953300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 953400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 953500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 953600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 953700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 953800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 953900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 954000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 954100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 954200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 954300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 954400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 954500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 954600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0375, + "step": 954700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 954800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 954900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 955000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 955100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 955200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 955300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 955400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 955500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 955600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 955700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 955800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 955900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 956000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 956100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 956200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 956300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 956400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 956500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 956600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 956700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 956800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 956900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 957000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 957100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 957200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 957300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 957400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 957500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 957600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 957700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 957800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 957900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 958000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 958100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 958200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 958300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 958400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 958500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 958600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 958700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 958800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 958900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 959000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 959100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 959200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 959300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 959400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 959500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 959600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 959700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 959800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 959900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 960000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0309295654296875, + "eval_runtime": 3121.7204, + "eval_samples_per_second": 360.289, + "eval_steps_per_second": 22.518, + "step": 960000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 960100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 960200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 960300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 960400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 960500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 960600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 960700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 960800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 960900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 961000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 961100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 961200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 961300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 961400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 961500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 961600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 961700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 961800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 961900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 962000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 962100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 962200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 962300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 962400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 962500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 962600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 962700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 962800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 962900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 963000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 963100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 963200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 963300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 963400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 963500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 963600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 963700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 963800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 963900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 964000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 964100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 964200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 964300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 964400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 964500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 964600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 964700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 964800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 964900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 965000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0383, + "step": 965100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 965200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 965300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 965400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 965500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 965600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 965700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 965800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 965900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 966000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 966100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 966200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 966300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 966400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 966500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 966600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 966700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 966800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 966900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 967000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 967100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 967200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 967300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 967400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 967500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 967600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 967700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 967800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 967900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 968000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 968100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 968200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 968300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 968400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 968500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 968600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 968700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 968800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 968900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 969000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 969100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 969200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 969300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 969400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 969500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 969600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 969700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 969800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 969900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 970000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 970100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 970200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 970300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 970400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 970500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0368, + "step": 970600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 970700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 970800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 970900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 971000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 971100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 971200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 971300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 971400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 971500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 971600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 971700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 971800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 971900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 972000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 972100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 972200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 972300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 972400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 972500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 972600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 972700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 972800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 972900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 973000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 973100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 973200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 973300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 973400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 973500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 973600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 973700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 973800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 973900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 974000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 974100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0374, + "step": 974200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 974300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 974400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 974500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 974600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 974700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 974800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 974900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 975000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 975100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 975200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 975300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 975400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 975500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 975600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 975700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 975800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 975900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 976000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 976100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 976200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 976300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 976400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 976500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 976600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 976700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 976800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 976900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 977000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 977100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 977200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 977300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 977400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 977500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 977600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 977700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 977800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 977900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 978000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 978100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 978200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 978300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 978400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 978500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 978600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 978700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 978800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 978900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 979000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 979100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 979200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 979300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 979400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 979500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 979600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 979700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 979800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 979900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 980000 + }, + { + "epoch": 0.0, + "eval_loss": 0.030792236328125, + "eval_runtime": 3092.5375, + "eval_samples_per_second": 363.689, + "eval_steps_per_second": 22.731, + "step": 980000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 980100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 980200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 980300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 980400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 980500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 980600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 980700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 980800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 980900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 981000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 981100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 981200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 981300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 981400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 981500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 981600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 981700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 981800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 981900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 982000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 982100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 982200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0373, + "step": 982300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 982400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 982500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 982600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 982700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 982800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 982900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 983000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 983100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 983200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 983300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 983400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 983500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 983600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 983700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 983800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 983900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 984000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 984100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 984200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 984300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 984400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 984500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 984600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 984700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 984800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 984900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 985000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 985100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 985200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 985300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 985400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 985500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 985600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 985700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 985800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 985900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 986000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 986100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 986200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 986300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 986400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 986500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 986600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 986700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 986800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 986900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 987000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 987100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 987200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 987300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 987400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 987500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 987600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 987700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 987800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 987900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 988000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 988100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 988200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 988300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 988400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 988500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 988600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 988700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 988800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 988900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 989000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 989100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 989200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 989300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 989400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 989500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 989600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 989700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 989800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 989900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 990000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 990100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 990200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 990300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 990400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 990500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 990600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 990700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 990800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 990900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 991000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 991100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 991200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 991300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 991400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 991500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 991600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 991700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 991800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 991900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 992000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 992100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 992200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 992300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 992400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 992500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 992600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 992700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 992800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 992900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 993000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 993100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 993200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 993300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 993400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 993500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 993600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 993700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 993800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 993900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 994000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 994100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 994200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 994300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 994400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 994500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 994600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 994700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 994800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 994900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 995000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 995100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 995200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 995300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 995400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 995500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 995600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 995700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 995800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 995900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 996000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 996100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 996200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 996300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 996400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 996500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 996600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 996700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 996800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 996900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 997000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 997100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 997200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 997300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 997400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 997500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 997600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 997700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 997800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 997900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 998000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 998100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 998200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 998300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 998400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 998500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 998600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 998700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 998800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 998900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 999000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 999100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 999200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 999300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 999400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 999500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 999600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0372, + "step": 999700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 999800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 999900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1000000 + }, + { + "epoch": 0.0, + "eval_loss": 0.03070068359375, + "eval_runtime": 3196.3803, + "eval_samples_per_second": 351.874, + "eval_steps_per_second": 21.992, + "step": 1000000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1000100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1000200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 1000300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 1000400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 1000500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 1000600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 1000700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1000800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1000900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1001000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1001100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1001200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 1001300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1001400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 1001500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1001600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 1001700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1001800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 1001900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1002000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1002100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 1002200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1002300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 1002400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1002500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1002600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0369, + "step": 1002700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1002800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1002900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 1003000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 1003100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1003200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1003300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1003400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1003500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1003600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1003700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1003800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 1003900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1004000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 1004100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 1004200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 1004300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1004400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1004500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1004600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1004700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1004800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1004900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1005000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1005100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1005200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1005300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1005400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 1005500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1005600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1005700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 1005800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 1005900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1006000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1006100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1006200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1006300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1006400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 1006500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 1006600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 1006700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 1006800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 1006900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1007000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1007100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 1007200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1007300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 1007400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1007500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1007600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 1007700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1007800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 1007900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 1008000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 1008100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1008200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1008300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1008400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1008500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1008600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1008700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1008800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 1008900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1009000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1009100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1009200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 1009300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 1009400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 1009500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1009600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1009700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1009800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1009900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1010000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1010100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 1010200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1010300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1010400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1010500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1010600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1010700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 1010800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1010900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 1011000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1011100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1011200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1011300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 1011400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1011500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 1011600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1011700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 1011800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1011900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1012000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1012100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1012200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1012300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1012400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1012500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1012600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1012700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1012800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1012900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1013000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1013100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1013200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1013300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 1013400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1013500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1013600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 1013700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 1013800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 1013900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1014000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 1014100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1014200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1014300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1014400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1014500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1014600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1014700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1014800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1014900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1015000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1015100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1015200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1015300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1015400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1015500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1015600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 1015700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1015800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1015900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1016000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1016100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1016200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1016300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1016400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 1016500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1016600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 1016700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1016800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1016900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1017000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 1017100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1017200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 1017300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1017400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1017500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1017600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1017700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1017800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1017900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 1018000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1018100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1018200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1018300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1018400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1018500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1018600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 1018700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1018800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1018900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1019000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1019100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1019200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1019300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1019400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 1019500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1019600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1019700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1019800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1019900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 1020000 + }, + { + "epoch": 0.0, + "eval_loss": 0.030609130859375, + "eval_runtime": 3266.2908, + "eval_samples_per_second": 344.343, + "eval_steps_per_second": 21.522, + "step": 1020000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1020100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 1020200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1020300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1020400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1020500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 1020600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1020700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1020800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 1020900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1021000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 1021100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1021200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1021300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1021400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1021500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1021600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 1021700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1021800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1021900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1022000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1022100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1022200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1022300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1022400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1022500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1022600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1022700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1022800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1022900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1023000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1023100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1023200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1023300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1023400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1023500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1023600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1023700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 1023800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1023900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 1024000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1024100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1024200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 1024300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1024400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1024500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 1024600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1024700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1024800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1024900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1025000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1025100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1025200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 1025300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1025400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 1025500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1025600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1025700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1025800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1025900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1026000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0375, + "step": 1026100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1026200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1026300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1026400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 1026500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1026600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1026700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1026800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0373, + "step": 1026900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1027000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1027100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1027200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 1027300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1027400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1027500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1027600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1027700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1027800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1027900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1028000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1028100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1028200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1028300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1028400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1028500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1028600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1028700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1028800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1028900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1029000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1029100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1029200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1029300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1029400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1029500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 1029600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1029700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1029800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1029900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 1030000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1030100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 1030200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 1030300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1030400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1030500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1030600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1030700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1030800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1030900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1031000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 1031100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1031200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1031300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1031400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1031500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 1031600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1031700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 1031800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1031900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1032000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1032100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1032200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1032300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1032400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1032500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1032600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1032700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 1032800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1032900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1033000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1033100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1033200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1033300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 1033400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1033500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 1033600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1033700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1033800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1033900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1034000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1034100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1034200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1034300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1034400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 1034500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1034600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1034700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1034800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1034900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1035000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1035100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1035200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1035300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1035400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1035500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1035600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1035700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1035800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1035900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1036000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1036100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1036200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 1036300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1036400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1036500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 1036600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 1036700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 1036800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 1036900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1037000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1037100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1037200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1037300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1037400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1037500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1037600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 1037700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1037800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1037900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1038000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1038100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1038200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 1038300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1038400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1038500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1038600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 1038700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1038800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 1038900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1039000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1039100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1039200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1039300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1039400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 1039500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1039600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1039700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1039800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1039900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1040000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0306549072265625, + "eval_runtime": 3024.1998, + "eval_samples_per_second": 371.908, + "eval_steps_per_second": 23.244, + "step": 1040000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1040100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 1040200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1040300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1040400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1040500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1040600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1040700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1040800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1040900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1041000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1041100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1041200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1041300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1041400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 1041500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1041600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1041700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1041800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1041900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1042000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1042100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1042200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 1042300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1042400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1042500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1042600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1042700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1042800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0369, + "step": 1042900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1043000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1043100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1043200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1043300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1043400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1043500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1043600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1043700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1043800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1043900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1044000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1044100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1044200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1044300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1044400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1044500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1044600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1044700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 1044800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 1044900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1045000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1045100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1045200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1045300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 1045400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1045500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1045600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1045700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1045800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 1045900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 1046000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1046100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 1046200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1046300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1046400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1046500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1046600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 1046700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1046800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1046900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1047000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1047100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1047200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1047300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1047400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1047500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1047600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1047700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1047800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1047900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1048000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1048100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1048200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1048300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1048400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 1048500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1048600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1048700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1048800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1048900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1049000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1049100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1049200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1049300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1049400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1049500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1049600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1049700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 1049800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1049900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1050000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1050100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 1050200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1050300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 1050400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1050500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1050600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 1050700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 1050800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1050900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1051000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 1051100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 1051200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1051300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1051400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1051500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1051600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1051700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1051800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1051900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1052000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1052100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1052200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1052300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 1052400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1052500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 1052600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1052700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1052800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1052900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1053000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1053100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1053200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1053300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 1053400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1053500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1053600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1053700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1053800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1053900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1054000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1054100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1054200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1054300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1054400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1054500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1054600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0368, + "step": 1054700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1054800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1054900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 1055000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1055100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 1055200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 1055300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1055400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1055500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 1055600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1055700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1055800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1055900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1056000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 1056100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1056200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1056300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 1056400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 1056500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1056600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1056700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1056800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1056900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1057000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1057100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 1057200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1057300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1057400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1057500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1057600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1057700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1057800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 1057900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1058000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1058100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1058200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1058300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1058400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 1058500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1058600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 1058700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1058800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1058900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1059000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1059100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1059200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1059300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1059400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 1059500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 1059600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1059700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1059800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1059900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1060000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0303802490234375, + "eval_runtime": 3085.9361, + "eval_samples_per_second": 364.467, + "eval_steps_per_second": 22.779, + "step": 1060000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1060100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1060200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1060300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1060400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1060500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1060600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1060700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1060800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1060900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1061000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1061100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1061200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 1061300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1061400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 1061500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1061600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 1061700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1061800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1061900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1062000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 1062100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1062200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1062300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1062400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1062500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1062600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1062700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1062800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 1062900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 1063000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1063100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1063200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1063300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 1063400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1063500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1063600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 1063700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1063800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1063900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1064000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1064100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1064200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1064300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 1064400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1064500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1064600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1064700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 1064800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 1064900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1065000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1065100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1065200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1065300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1065400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 1065500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1065600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1065700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1065800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1065900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 1066000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1066100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1066200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1066300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1066400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1066500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1066600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1066700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1066800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1066900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1067000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1067100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1067200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1067300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1067400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 1067500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 1067600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1067700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1067800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1067900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1068000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1068100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1068200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1068300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1068400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1068500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1068600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1068700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 1068800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1068900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1069000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 1069100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1069200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1069300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1069400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 1069500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1069600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1069700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 1069800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1069900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1070000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1070100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1070200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1070300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 1070400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1070500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1070600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1070700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1070800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1070900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1071000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1071100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 1071200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1071300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1071400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1071500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1071600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1071700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1071800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1071900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1072000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 1072100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1072200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1072300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 1072400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1072500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1072600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1072700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1072800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1072900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1073000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 1073100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1073200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1073300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1073400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1073500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 1073600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1073700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 1073800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1073900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 1074000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1074100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1074200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 1074300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1074400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1074500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1074600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1074700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 1074800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1074900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1075000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1075100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1075200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1075300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1075400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 1075500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1075600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1075700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1075800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1075900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 1076000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1076100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1076200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 1076300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1076400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1076500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1076600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 1076700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1076800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1076900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1077000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1077100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1077200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1077300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1077400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1077500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1077600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1077700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1077800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1077900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1078000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1078100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1078200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 1078300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1078400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 1078500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 1078600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1078700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1078800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1078900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1079000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1079100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1079200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1079300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1079400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1079500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1079600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1079700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 1079800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1079900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 1080000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0302734375, + "eval_runtime": 3131.3287, + "eval_samples_per_second": 359.184, + "eval_steps_per_second": 22.449, + "step": 1080000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1080100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1080200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 1080300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1080400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 1080500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1080600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1080700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1080800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1080900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 1081000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1081100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1081200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1081300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1081400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1081500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1081600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1081700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1081800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1081900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1082000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1082100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1082200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1082300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1082400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1082500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1082600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1082700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1082800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 1082900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1083000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1083100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1083200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1083300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1083400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1083500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1083600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1083700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1083800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1083900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 1084000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1084100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1084200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1084300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1084400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1084500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1084600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1084700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1084800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1084900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 1085000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1085100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 1085200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 1085300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1085400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1085500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1085600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1085700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1085800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1085900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1086000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1086100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1086200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1086300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1086400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1086500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1086600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1086700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1086800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1086900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1087000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1087100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 1087200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1087300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1087400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1087500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1087600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 1087700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1087800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 1087900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1088000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1088100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1088200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1088300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1088400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1088500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 1088600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1088700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1088800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1088900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1089000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1089100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1089200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1089300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1089400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 1089500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1089600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1089700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1089800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1089900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 1090000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1090100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1090200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1090300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1090400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1090500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1090600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1090700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 1090800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1090900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1091000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1091100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1091200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1091300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 1091400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1091500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1091600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1091700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1091800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1091900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1092000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1092100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 1092200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1092300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 1092400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1092500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1092600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1092700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 1092800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1092900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 1093000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1093100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 1093200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1093300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 1093400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1093500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1093600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1093700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1093800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1093900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1094000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1094100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1094200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 1094300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1094400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1094500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1094600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1094700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 1094800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1094900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 1095000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 1095100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1095200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1095300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1095400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 1095500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1095600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1095700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1095800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1095900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1096000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1096100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 1096200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1096300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1096400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1096500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1096600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1096700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1096800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 1096900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1097000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1097100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1097200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 1097300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1097400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 1097500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1097600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1097700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 1097800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1097900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1098000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1098100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1098200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1098300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1098400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1098500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1098600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 1098700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1098800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1098900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1099000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1099100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1099200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1099300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1099400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1099500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1099600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1099700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 1099800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1099900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 1100000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0305328369140625, + "eval_runtime": 3084.7934, + "eval_samples_per_second": 364.602, + "eval_steps_per_second": 22.788, + "step": 1100000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1100100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1100200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 1100300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1100400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1100500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1100600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1100700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1100800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1100900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1101000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1101100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1101200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1101300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1101400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1101500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1101600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1101700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1101800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1101900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1102000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1102100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1102200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 1102300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1102400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1102500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1102600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1102700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1102800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1102900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1103000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1103100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1103200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1103300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1103400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1103500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1103600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1103700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1103800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1103900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1104000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1104100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 1104200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1104300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1104400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1104500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1104600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1104700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1104800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1104900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1105000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 1105100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1105200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1105300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1105400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1105500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1105600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1105700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1105800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1105900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 1106000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1106100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1106200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1106300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1106400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1106500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1106600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1106700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1106800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1106900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1107000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1107100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1107200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1107300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 1107400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1107500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1107600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1107700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1107800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1107900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1108000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1108100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1108200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1108300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1108400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1108500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1108600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1108700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1108800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1108900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1109000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1109100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1109200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1109300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1109400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1109500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1109600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1109700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1109800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1109900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 1110000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1110100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1110200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1110300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1110400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1110500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1110600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1110700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1110800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1110900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1111000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1111100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1111200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1111300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1111400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1111500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1111600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1111700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1111800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1111900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1112000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1112100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1112200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1112300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1112400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1112500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1112600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1112700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1112800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1112900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1113000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1113100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1113200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1113300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1113400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1113500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1113600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1113700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1113800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1113900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1114000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1114100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1114200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1114300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1114400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1114500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1114600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1114700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1114800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1114900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1115000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1115100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1115200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1115300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1115400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1115500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1115600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1115700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1115800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1115900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1116000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 1116100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1116200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1116300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1116400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1116500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1116600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1116700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 1116800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1116900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1117000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1117100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1117200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1117300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 1117400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 1117500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 1117600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 1117700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1117800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1117900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1118000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1118100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1118200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 1118300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1118400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1118500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1118600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1118700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1118800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1118900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1119000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1119100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1119200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1119300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1119400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 1119500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1119600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1119700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1119800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1119900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1120000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0302276611328125, + "eval_runtime": 3228.7917, + "eval_samples_per_second": 348.342, + "eval_steps_per_second": 21.772, + "step": 1120000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1120100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1120200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1120300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 1120400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1120500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1120600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1120700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1120800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 1120900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1121000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1121100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1121200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1121300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1121400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1121500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 1121600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 1121700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 1121800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1121900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1122000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1122100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1122200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1122300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1122400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1122500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1122600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1122700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1122800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1122900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1123000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1123100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1123200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1123300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1123400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1123500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1123600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1123700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1123800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1123900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 1124000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 1124100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1124200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1124300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1124400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1124500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1124600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1124700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1124800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1124900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1125000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1125100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1125200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 1125300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1125400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1125500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1125600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1125700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1125800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1125900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1126000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1126100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1126200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1126300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1126400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1126500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1126600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1126700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1126800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1126900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1127000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1127100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1127200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1127300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1127400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1127500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1127600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1127700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1127800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1127900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1128000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 1128100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1128200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1128300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1128400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 1128500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1128600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1128700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1128800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1128900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1129000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1129100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1129200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1129300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1129400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1129500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1129600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1129700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1129800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1129900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1130000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1130100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1130200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1130300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1130400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1130500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1130600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 1130700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 1130800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1130900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1131000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1131100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1131200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1131300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1131400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1131500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1131600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 1131700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 1131800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1131900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 1132000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1132100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1132200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1132300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1132400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1132500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1132600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1132700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1132800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 1132900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1133000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 1133100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1133200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 1133300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1133400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1133500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1133600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1133700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1133800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1133900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1134000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1134100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1134200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1134300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1134400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1134500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1134600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1134700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1134800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1134900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 1135000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1135100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1135200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 1135300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 1135400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1135500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1135600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1135700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1135800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1135900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1136000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1136100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1136200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1136300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1136400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1136500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1136600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1136700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1136800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1136900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1137000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1137100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 1137200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1137300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1137400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1137500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1137600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1137700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1137800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1137900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1138000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 1138100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1138200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1138300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1138400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1138500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1138600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1138700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1138800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1138900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1139000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1139100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 1139200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 1139300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1139400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1139500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 1139600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1139700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1139800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1139900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1140000 + }, + { + "epoch": 0.0, + "eval_loss": 0.030303955078125, + "eval_runtime": 3150.8709, + "eval_samples_per_second": 356.956, + "eval_steps_per_second": 22.31, + "step": 1140000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1140100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1140200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1140300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1140400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1140500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1140600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 1140700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 1140800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1140900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1141000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1141100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 1141200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1141300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1141400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1141500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1141600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1141700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 1141800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1141900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1142000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1142100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1142200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1142300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1142400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1142500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1142600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 1142700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1142800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1142900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1143000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 1143100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1143200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1143300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1143400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1143500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1143600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1143700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1143800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1143900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 1144000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 1144100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 1144200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1144300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1144400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 1144500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1144600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1144700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1144800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1144900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1145000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1145100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 1145200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1145300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1145400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1145500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1145600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1145700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1145800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1145900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 1146000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1146100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1146200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1146300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 1146400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1146500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1146600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1146700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1146800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1146900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1147000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1147100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 1147200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1147300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1147400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1147500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1147600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1147700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1147800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1147900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1148000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1148100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1148200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1148300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1148400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 1148500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1148600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1148700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 1148800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1148900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1149000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1149100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1149200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 1149300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1149400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1149500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1149600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1149700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1149800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1149900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1150000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1150100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1150200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1150300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1150400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1150500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1150600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1150700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1150800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1150900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1151000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1151100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1151200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1151300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1151400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1151500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 1151600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1151700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1151800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1151900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1152000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1152100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1152200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1152300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 1152400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 1152500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1152600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1152700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1152800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1152900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1153000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1153100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1153200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1153300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1153400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1153500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1153600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1153700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1153800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1153900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1154000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1154100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1154200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1154300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1154400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 1154500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1154600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1154700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1154800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1154900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1155000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1155100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1155200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1155300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1155400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1155500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1155600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 1155700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1155800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1155900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 1156000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1156100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1156200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1156300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1156400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1156500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 1156600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1156700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 1156800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1156900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1157000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1157100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1157200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1157300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1157400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1157500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1157600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1157700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1157800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1157900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1158000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1158100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1158200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1158300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1158400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1158500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1158600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1158700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1158800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1158900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1159000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 1159100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1159200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1159300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1159400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1159500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1159600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1159700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1159800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1159900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1160000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0301055908203125, + "eval_runtime": 3130.1048, + "eval_samples_per_second": 359.324, + "eval_steps_per_second": 22.458, + "step": 1160000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1160100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1160200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1160300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 1160400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1160500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1160600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1160700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1160800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1160900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1161000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1161100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1161200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 1161300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1161400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1161500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1161600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 1161700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 1161800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1161900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1162000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1162100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1162200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1162300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1162400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1162500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1162600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1162700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1162800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1162900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 1163000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1163100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1163200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 1163300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1163400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1163500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1163600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1163700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1163800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1163900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1164000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1164100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1164200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1164300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1164400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1164500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 1164600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 1164700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1164800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1164900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1165000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1165100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1165200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 1165300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1165400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1165500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1165600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1165700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1165800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1165900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1166000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1166100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1166200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1166300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1166400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1166500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1166600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1166700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1166800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 1166900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1167000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1167100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1167200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1167300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1167400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1167500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1167600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1167700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1167800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1167900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1168000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 1168100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 1168200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1168300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1168400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1168500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1168600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1168700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 1168800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 1168900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1169000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1169100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1169200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 1169300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1169400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1169500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1169600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1169700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 1169800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1169900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1170000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1170100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1170200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1170300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1170400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1170500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1170600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1170700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1170800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 1170900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1171000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1171100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1171200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1171300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1171400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1171500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1171600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1171700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1171800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1171900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1172000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 1172100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1172200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1172300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1172400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1172500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1172600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1172700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1172800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 1172900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1173000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1173100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1173200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1173300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1173400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1173500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1173600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1173700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1173800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1173900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1174000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 1174100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1174200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1174300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 1174400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1174500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1174600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1174700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1174800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1174900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1175000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1175100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1175200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1175300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1175400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1175500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1175600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1175700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1175800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 1175900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 1176000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 1176100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1176200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1176300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1176400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1176500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1176600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1176700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 1176800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 1176900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1177000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1177100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1177200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1177300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1177400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 1177500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1177600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1177700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1177800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1177900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1178000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1178100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1178200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1178300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1178400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1178500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1178600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 1178700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1178800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 1178900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1179000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1179100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 1179200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1179300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1179400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1179500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 1179600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1179700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 1179800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1179900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 1180000 + }, + { + "epoch": 0.0, + "eval_loss": 0.02996826171875, + "eval_runtime": 3211.297, + "eval_samples_per_second": 350.239, + "eval_steps_per_second": 21.89, + "step": 1180000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1180100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1180200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1180300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1180400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1180500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 1180600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1180700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1180800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1180900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1181000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1181100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1181200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1181300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1181400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1181500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1181600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1181700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1181800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1181900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1182000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1182100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1182200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1182300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1182400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1182500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 1182600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 1182700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 1182800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1182900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1183000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1183100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1183200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1183300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1183400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1183500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1183600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1183700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 1183800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1183900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1184000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1184100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1184200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 1184300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1184400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1184500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1184600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1184700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1184800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1184900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1185000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1185100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1185200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 1185300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1185400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1185500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1185600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1185700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1185800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1185900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1186000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1186100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1186200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1186300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1186400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 1186500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 1186600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0377, + "step": 1186700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 1186800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1186900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 1187000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1187100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1187200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1187300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1187400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1187500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1187600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1187700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 1187800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1187900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1188000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1188100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1188200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1188300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1188400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1188500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1188600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1188700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1188800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1188900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1189000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1189100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1189200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1189300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1189400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1189500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1189600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1189700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1189800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1189900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 1190000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1190100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1190200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 1190300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1190400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1190500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 1190600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1190700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1190800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1190900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1191000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1191100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 1191200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1191300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1191400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1191500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1191600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1191700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1191800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1191900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1192000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1192100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1192200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1192300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1192400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1192500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1192600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1192700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1192800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1192900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1193000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1193100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1193200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1193300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 1193400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1193500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1193600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1193700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1193800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1193900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1194000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1194100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 1194200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1194300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1194400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1194500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1194600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1194700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1194800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1194900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1195000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1195100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1195200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1195300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1195400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1195500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1195600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1195700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1195800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1195900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1196000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 1196100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1196200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1196300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1196400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1196500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1196600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1196700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1196800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1196900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1197000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 1197100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1197200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1197300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1197400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1197500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1197600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1197700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 1197800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1197900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1198000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1198100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1198200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1198300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1198400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1198500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1198600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 1198700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1198800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1198900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1199000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1199100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1199200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 1199300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1199400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1199500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1199600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1199700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1199800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1199900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1200000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0304412841796875, + "eval_runtime": 4126.4351, + "eval_samples_per_second": 272.565, + "eval_steps_per_second": 17.036, + "step": 1200000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1200100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1200200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1200300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1200400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1200500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1200600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 1200700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1200800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1200900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1201000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1201100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1201200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1201300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1201400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1201500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1201600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 1201700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1201800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1201900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 1202000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 1202100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1202200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1202300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1202400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1202500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1202600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 1202700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 1202800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1202900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1203000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1203100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1203200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 1203300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1203400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1203500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1203600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1203700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1203800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1203900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1204000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1204100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1204200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1204300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1204400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1204500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1204600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1204700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 1204800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1204900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 1205000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 1205100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1205200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 1205300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1205400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1205500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1205600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1205700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 1205800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1205900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1206000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1206100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1206200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1206300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1206400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1206500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1206600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 1206700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1206800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1206900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1207000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1207100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1207200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1207300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1207400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1207500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1207600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1207700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1207800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1207900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1208000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 1208100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1208200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1208300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1208400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1208500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1208600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1208700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1208800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1208900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1209000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1209100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1209200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1209300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1209400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1209500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1209600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 1209700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1209800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1209900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1210000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1210100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1210200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1210300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1210400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1210500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1210600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1210700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1210800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1210900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 1211000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1211100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1211200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1211300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1211400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 1211500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1211600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1211700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1211800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1211900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1212000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1212100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1212200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1212300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1212400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1212500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1212600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1212700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1212800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1212900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1213000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1213100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1213200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1213300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1213400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1213500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1213600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1213700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1213800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1213900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1214000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1214100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1214200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1214300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1214400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1214500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1214600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1214700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1214800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1214900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1215000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1215100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 1215200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1215300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1215400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1215500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 1215600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 1215700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 1215800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1215900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1216000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1216100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1216200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1216300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1216400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1216500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1216600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1216700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1216800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1216900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1217000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1217100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1217200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1217300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1217400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1217500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1217600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1217700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1217800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1217900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1218000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1218100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1218200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1218300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 1218400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1218500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1218600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1218700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1218800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1218900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1219000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1219100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1219200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 1219300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1219400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1219500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1219600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1219700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1219800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1219900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1220000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0304107666015625, + "eval_runtime": 3976.69, + "eval_samples_per_second": 282.829, + "eval_steps_per_second": 17.677, + "step": 1220000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1220100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1220200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1220300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1220400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1220500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1220600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1220700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1220800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1220900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1221000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 1221100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1221200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1221300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1221400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1221500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 1221600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1221700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 1221800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1221900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1222000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1222100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 1222200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 1222300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1222400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1222500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 1222600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1222700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1222800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1222900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1223000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1223100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1223200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1223300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1223400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1223500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1223600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1223700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1223800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1223900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1224000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 1224100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1224200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1224300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1224400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1224500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1224600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1224700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1224800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1224900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1225000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1225100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 1225200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1225300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1225400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1225500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1225600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1225700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1225800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1225900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1226000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1226100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1226200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1226300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1226400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1226500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1226600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1226700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1226800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1226900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1227000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 1227100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1227200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1227300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1227400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1227500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1227600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1227700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1227800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 1227900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1228000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1228100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1228200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1228300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1228400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1228500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1228600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1228700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1228800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1228900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1229000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1229100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1229200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 1229300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1229400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 1229500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0412, + "step": 1229600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1229700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1229800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 1229900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1230000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1230100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1230200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1230300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1230400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1230500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1230600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1230700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1230800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1230900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1231000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1231100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1231200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1231300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1231400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1231500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1231600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1231700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1231800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1231900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1232000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 1232100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1232200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1232300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1232400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1232500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1232600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1232700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1232800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1232900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1233000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1233100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1233200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1233300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1233400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1233500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1233600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1233700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1233800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 1233900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1234000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1234100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1234200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1234300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1234400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1234500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1234600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1234700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1234800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1234900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1235000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1235100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1235200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1235300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1235400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1235500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1235600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1235700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 1235800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1235900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1236000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1236100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1236200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1236300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1236400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1236500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1236600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1236700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1236800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1236900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 1237000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1237100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 1237200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1237300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 1237400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1237500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1237600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1237700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 1237800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1237900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1238000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1238100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1238200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1238300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1238400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1238500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1238600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 1238700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1238800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1238900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 1239000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1239100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1239200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1239300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1239400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1239500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1239600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1239700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 1239800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1239900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 1240000 + }, + { + "epoch": 0.0, + "eval_loss": 0.03009033203125, + "eval_runtime": 3986.2287, + "eval_samples_per_second": 282.152, + "eval_steps_per_second": 17.635, + "step": 1240000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1240100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1240200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1240300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 1240400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1240500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1240600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1240700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1240800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1240900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1241000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1241100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1241200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1241300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1241400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1241500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1241600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1241700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1241800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1241900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1242000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1242100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1242200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1242300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1242400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1242500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1242600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 1242700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1242800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1242900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1243000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1243100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 1243200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1243300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1243400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1243500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1243600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1243700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1243800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1243900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1244000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1244100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1244200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1244300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1244400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1244500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1244600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 1244700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1244800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1244900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1245000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1245100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1245200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1245300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 1245400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1245500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1245600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1245700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1245800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1245900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1246000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1246100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1246200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 1246300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1246400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1246500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1246600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1246700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1246800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1246900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 1247000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1247100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1247200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1247300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 1247400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1247500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1247600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1247700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1247800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1247900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1248000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1248100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1248200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1248300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1248400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1248500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1248600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1248700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1248800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1248900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1249000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 1249100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1249200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1249300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1249400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1249500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1249600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1249700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1249800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1249900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1250000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1250100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1250200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1250300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1250400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1250500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1250600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1250700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 1250800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1250900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1251000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1251100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1251200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1251300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1251400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1251500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1251600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1251700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1251800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1251900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1252000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1252100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1252200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1252300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1252400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1252500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1252600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1252700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1252800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1252900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1253000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1253100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1253200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1253300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1253400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1253500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1253600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1253700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1253800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1253900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1254000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1254100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1254200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1254300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1254400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 1254500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 1254600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1254700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1254800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1254900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1255000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1255100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1255200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1255300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1255400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1255500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1255600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1255700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1255800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1255900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1256000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1256100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1256200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 1256300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1256400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1256500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1256600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1256700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1256800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 1256900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1257000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1257100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1257200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 1257300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1257400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1257500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1257600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1257700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 1257800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1257900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1258000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1258100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 1258200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1258300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1258400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1258500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 1258600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1258700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1258800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 1258900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1259000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1259100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1259200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1259300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1259400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1259500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1259600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 1259700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1259800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1259900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1260000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0300445556640625, + "eval_runtime": 3604.2377, + "eval_samples_per_second": 312.056, + "eval_steps_per_second": 19.504, + "step": 1260000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1260100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1260200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 1260300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1260400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1260500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1260600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1260700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1260800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1260900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1261000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1261100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1261200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1261300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1261400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1261500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1261600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1261700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1261800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 1261900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1262000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1262100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1262200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1262300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1262400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 1262500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1262600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1262700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1262800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1262900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1263000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 1263100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 1263200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1263300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1263400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1263500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1263600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1263700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1263800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1263900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1264000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1264100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1264200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1264300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1264400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1264500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1264600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1264700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1264800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1264900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1265000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1265100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1265200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1265300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1265400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 1265500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1265600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1265700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1265800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 1265900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1266000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1266100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1266200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1266300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1266400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1266500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1266600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1266700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1266800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1266900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1267000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1267100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1267200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1267300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1267400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1267500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1267600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 1267700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1267800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1267900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1268000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1268100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1268200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1268300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1268400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1268500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1268600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1268700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1268800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 1268900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1269000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 1269100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1269200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1269300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1269400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 1269500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1269600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1269700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1269800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1269900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 1270000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1270100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 1270200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1270300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 1270400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 1270500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1270600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1270700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1270800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1270900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1271000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1271100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1271200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1271300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1271400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1271500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 1271600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1271700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1271800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1271900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1272000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1272100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1272200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1272300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 1272400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1272500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 1272600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0382, + "step": 1272700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0374, + "step": 1272800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 1272900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 1273000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1273100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1273200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1273300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1273400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1273500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1273600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1273700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1273800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 1273900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1274000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1274100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1274200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1274300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 1274400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1274500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1274600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1274700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1274800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1274900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1275000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1275100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1275200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1275300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1275400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1275500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1275600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1275700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1275800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1275900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1276000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1276100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1276200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1276300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1276400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1276500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 1276600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1276700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1276800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1276900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1277000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1277100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1277200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1277300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 1277400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1277500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1277600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 1277700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1277800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1277900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 1278000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1278100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1278200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1278300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1278400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 1278500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1278600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1278700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1278800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1278900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1279000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1279100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1279200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1279300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1279400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1279500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1279600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1279700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1279800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1279900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1280000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0305328369140625, + "eval_runtime": 4108.9417, + "eval_samples_per_second": 273.726, + "eval_steps_per_second": 17.108, + "step": 1280000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1280100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1280200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1280300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1280400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1280500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1280600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1280700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1280800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1280900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1281000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 1281100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1281200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1281300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1281400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1281500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1281600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1281700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1281800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1281900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1282000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1282100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1282200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 1282300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1282400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 1282500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1282600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1282700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1282800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1282900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1283000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1283100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1283200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1283300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1283400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1283500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1283600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1283700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1283800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 1283900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1284000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0402, + "step": 1284100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1284200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1284300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 1284400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1284500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 1284600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 1284700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1284800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1284900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1285000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1285100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1285200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1285300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 1285400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1285500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1285600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1285700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1285800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1285900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1286000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1286100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1286200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1286300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1286400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1286500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1286600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1286700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 1286800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1286900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1287000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1287100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1287200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1287300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1287400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 1287500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1287600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1287700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1287800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1287900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1288000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1288100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1288200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1288300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1288400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1288500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1288600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 1288700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1288800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1288900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 1289000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1289100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1289200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1289300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1289400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1289500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1289600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1289700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1289800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1289900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1290000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1290100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1290200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1290300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 1290400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1290500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1290600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 1290700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1290800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1290900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1291000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1291100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 1291200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 1291300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1291400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1291500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1291600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1291700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1291800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1291900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1292000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1292100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 1292200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1292300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1292400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1292500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1292600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 1292700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1292800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1292900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1293000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1293100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1293200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 1293300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1293400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1293500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1293600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1293700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1293800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 1293900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1294000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1294100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1294200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1294300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1294400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1294500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1294600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1294700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1294800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1294900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1295000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1295100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1295200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1295300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1295400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 1295500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1295600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1295700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1295800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1295900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1296000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1296100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1296200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1296300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1296400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1296500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1296600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1296700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1296800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1296900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1297000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1297100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1297200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1297300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 1297400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1297500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1297600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1297700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1297800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1297900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1298000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1298100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1298200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1298300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1298400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 1298500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1298600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1298700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1298800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1298900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1299000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1299100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1299200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 1299300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 1299400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1299500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1299600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1299700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1299800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1299900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1300000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0302276611328125, + "eval_runtime": 4462.333, + "eval_samples_per_second": 252.048, + "eval_steps_per_second": 15.753, + "step": 1300000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1300100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1300200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 1300300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1300400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1300500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1300600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1300700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1300800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1300900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1301000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1301100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1301200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1301300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1301400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1301500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1301600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1301700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1301800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 1301900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1302000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1302100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 1302200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1302300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1302400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1302500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1302600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1302700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1302800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1302900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1303000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1303100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1303200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1303300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1303400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1303500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1303600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 1303700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1303800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1303900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1304000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1304100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1304200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1304300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1304400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1304500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1304600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1304700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1304800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1304900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1305000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1305100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1305200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1305300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1305400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1305500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1305600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1305700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1305800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1305900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1306000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1306100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1306200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1306300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1306400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1306500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1306600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1306700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1306800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1306900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1307000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1307100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1307200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1307300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1307400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0394, + "step": 1307500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1307600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1307700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1307800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1307900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1308000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1308100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1308200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1308300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1308400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1308500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1308600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1308700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1308800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1308900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1309000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1309100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1309200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1309300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1309400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1309500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1309600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1309700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1309800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1309900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1310000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1310100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1310200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1310300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1310400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1310500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1310600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 1310700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1310800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1310900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1311000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1311100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1311200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1311300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1311400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1311500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1311600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1311700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1311800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1311900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1312000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1312100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1312200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1312300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1312400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1312500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1312600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1312700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1312800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1312900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1313000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 1313100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1313200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1313300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1313400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1313500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1313600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1313700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1313800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1313900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 1314000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1314100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1314200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1314300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1314400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1314500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 1314600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1314700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1314800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1314900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1315000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 1315100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1315200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 1315300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1315400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1315500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1315600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1315700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1315800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1315900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1316000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1316100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1316200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 1316300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1316400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1316500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1316600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1316700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1316800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1316900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1317000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1317100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1317200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1317300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1317400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1317500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 1317600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1317700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1317800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1317900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1318000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1318100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1318200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1318300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1318400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1318500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1318600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1318700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1318800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1318900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1319000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1319100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1319200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1319300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1319400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1319500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1319600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1319700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1319800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1319900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1320000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0299835205078125, + "eval_runtime": 3928.7483, + "eval_samples_per_second": 286.28, + "eval_steps_per_second": 17.893, + "step": 1320000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1320100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1320200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1320300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1320400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1320500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1320600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1320700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1320800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1320900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1321000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1321100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1321200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1321300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1321400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 1321500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1321600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1321700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1321800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1321900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1322000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1322100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1322200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1322300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1322400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1322500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1322600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1322700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1322800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1322900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1323000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1323100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1323200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1323300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1323400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 1323500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1323600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1323700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1323800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1323900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1324000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1324100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1324200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1324300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1324400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1324500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1324600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1324700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 1324800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1324900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1325000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1325100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1325200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1325300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1325400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 1325500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1325600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1325700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1325800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1325900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1326000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1326100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1326200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1326300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1326400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1326500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1326600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 1326700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 1326800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1326900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1327000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 1327100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1327200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1327300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1327400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1327500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1327600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1327700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 1327800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1327900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1328000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1328100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1328200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1328300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1328400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1328500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 1328600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1328700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1328800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1328900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1329000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1329100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1329200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1329300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1329400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1329500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1329600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1329700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1329800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1329900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1330000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 1330100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1330200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1330300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 1330400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1330500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1330600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1330700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1330800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1330900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1331000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 1331100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1331200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1331300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1331400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1331500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1331600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1331700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1331800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1331900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1332000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1332100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1332200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1332300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1332400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1332500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1332600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1332700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1332800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1332900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1333000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1333100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1333200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1333300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1333400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 1333500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1333600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1333700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1333800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1333900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 1334000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1334100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1334200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1334300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1334400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1334500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1334600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1334700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1334800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1334900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1335000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1335100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1335200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1335300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1335400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1335500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1335600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1335700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1335800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1335900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 1336000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1336100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1336200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1336300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1336400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1336500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1336600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1336700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1336800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1336900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1337000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1337100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1337200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1337300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 1337400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1337500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1337600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1337700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 1337800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1337900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1338000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1338100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1338200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1338300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1338400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 1338500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1338600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1338700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1338800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1338900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 1339000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1339100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 1339200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 1339300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 1339400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1339500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1339600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1339700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1339800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1339900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1340000 + }, + { + "epoch": 0.0, + "eval_loss": 0.030059814453125, + "eval_runtime": 3710.6583, + "eval_samples_per_second": 303.106, + "eval_steps_per_second": 18.944, + "step": 1340000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1340100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1340200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1340300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1340400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1340500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1340600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1340700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1340800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1340900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1341000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1341100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1341200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1341300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1341400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1341500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1341600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1341700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1341800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1341900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1342000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1342100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1342200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1342300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1342400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1342500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1342600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1342700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1342800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1342900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1343000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1343100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1343200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1343300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1343400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1343500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1343600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1343700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1343800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1343900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1344000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1344100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1344200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1344300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1344400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1344500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 1344600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1344700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1344800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1344900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1345000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1345100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1345200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1345300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1345400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1345500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1345600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1345700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1345800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1345900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1346000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1346100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 1346200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1346300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1346400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1346500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1346600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1346700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1346800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1346900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1347000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1347100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1347200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1347300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1347400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1347500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1347600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1347700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1347800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1347900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1348000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1348100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1348200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1348300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1348400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1348500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1348600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1348700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1348800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 1348900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1349000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1349100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1349200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1349300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1349400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1349500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1349600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 1349700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1349800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1349900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1350000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1350100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1350200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 1350300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1350400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1350500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1350600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1350700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1350800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1350900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1351000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1351100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1351200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1351300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1351400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1351500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1351600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 1351700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1351800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1351900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1352000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1352100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1352200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1352300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1352400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1352500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1352600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1352700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1352800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1352900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1353000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 1353100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1353200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1353300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1353400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1353500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1353600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1353700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1353800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1353900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 1354000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1354100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1354200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0399, + "step": 1354300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 1354400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0377, + "step": 1354500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1354600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1354700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1354800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1354900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1355000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 1355100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1355200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1355300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1355400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1355500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1355600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1355700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1355800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1355900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1356000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 1356100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1356200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1356300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1356400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1356500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1356600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1356700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1356800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1356900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1357000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1357100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1357200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1357300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1357400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1357500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1357600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1357700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 1357800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1357900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1358000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1358100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 1358200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1358300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1358400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1358500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1358600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1358700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1358800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1358900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1359000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1359100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1359200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 1359300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1359400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1359500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1359600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1359700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1359800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 1359900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1360000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0297088623046875, + "eval_runtime": 4153.2895, + "eval_samples_per_second": 270.803, + "eval_steps_per_second": 16.925, + "step": 1360000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1360100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1360200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1360300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1360400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1360500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1360600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1360700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1360800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1360900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1361000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1361100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1361200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1361300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1361400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1361500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1361600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1361700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1361800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1361900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1362000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1362100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1362200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1362300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0362, + "step": 1362400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1362500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1362600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1362700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1362800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1362900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1363000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1363100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1363200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 1363300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1363400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1363500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1363600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1363700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1363800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1363900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1364000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1364100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1364200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1364300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1364400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1364500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1364600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1364700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1364800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1364900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 1365000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1365100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1365200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1365300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1365400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1365500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1365600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1365700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1365800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1365900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1366000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1366100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1366200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1366300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1366400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1366500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1366600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1366700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1366800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1366900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1367000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1367100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1367200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1367300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 1367400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1367500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1367600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1367700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1367800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1367900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1368000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1368100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1368200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1368300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1368400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1368500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1368600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1368700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1368800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1368900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1369000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1369100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1369200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1369300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1369400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1369500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1369600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1369700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1369800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1369900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1370000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1370100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1370200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1370300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1370400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1370500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1370600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1370700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1370800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1370900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1371000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1371100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1371200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1371300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1371400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1371500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 1371600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1371700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1371800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1371900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1372000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1372100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0385, + "step": 1372200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 1372300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 1372400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1372500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0388, + "step": 1372600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1372700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1372800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 1372900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 1373000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1373100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1373200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1373300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1373400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1373500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1373600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1373700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1373800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1373900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1374000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1374100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1374200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1374300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1374400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1374500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1374600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1374700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1374800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1374900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1375000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1375100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1375200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1375300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1375400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1375500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1375600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1375700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1375800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1375900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1376000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 1376100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1376200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1376300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1376400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1376500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1376600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1376700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1376800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1376900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 1377000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1377100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1377200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1377300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 1377400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1377500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1377600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1377700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1377800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1377900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1378000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1378100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1378200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 1378300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 1378400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 1378500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1378600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1378700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1378800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1378900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1379000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1379100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1379200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1379300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1379400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1379500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1379600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1379700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 1379800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1379900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 1380000 + }, + { + "epoch": 0.0, + "eval_loss": 0.030059814453125, + "eval_runtime": 4082.9937, + "eval_samples_per_second": 275.465, + "eval_steps_per_second": 17.217, + "step": 1380000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 1380100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1380200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1380300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1380400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1380500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 1380600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 1380700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1380800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1380900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1381000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1381100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1381200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1381300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 1381400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1381500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1381600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1381700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1381800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1381900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1382000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1382100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1382200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1382300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1382400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1382500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1382600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1382700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1382800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1382900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1383000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1383100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1383200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1383300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 1383400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1383500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1383600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1383700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 1383800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1383900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1384000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1384100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1384200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1384300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1384400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 1384500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1384600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1384700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1384800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1384900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1385000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1385100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1385200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1385300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1385400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 1385500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1385600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1385700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 1385800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1385900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1386000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 1386100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1386200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1386300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1386400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1386500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1386600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1386700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1386800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1386900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1387000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1387100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1387200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1387300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1387400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1387500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1387600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1387700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1387800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1387900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1388000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1388100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1388200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1388300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1388400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1388500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1388600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1388700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1388800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1388900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1389000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1389100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1389200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1389300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1389400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1389500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1389600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1389700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1389800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1389900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1390000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1390100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1390200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1390300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1390400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1390500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1390600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1390700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 1390800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 1390900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1391000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1391100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1391200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1391300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1391400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1391500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1391600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1391700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 1391800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1391900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1392000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 1392100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1392200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1392300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 1392400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1392500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1392600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1392700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1392800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1392900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1393000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1393100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1393200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 1393300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1393400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1393500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1393600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 1393700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1393800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1393900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1394000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1394100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1394200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1394300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1394400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1394500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1394600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1394700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1394800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 1394900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1395000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1395100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1395200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1395300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1395400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1395500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1395600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1395700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1395800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1395900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1396000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1396100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1396200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1396300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1396400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 1396500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1396600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1396700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1396800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1396900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1397000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1397100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1397200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1397300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1397400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 1397500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1397600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1397700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1397800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 1397900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1398000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1398100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 1398200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1398300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1398400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 1398500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1398600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1398700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1398800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1398900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1399000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1399100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1399200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1399300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1399400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1399500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1399600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1399700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1399800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1399900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1400000 + }, + { + "epoch": 0.0, + "eval_loss": 0.029754638671875, + "eval_runtime": 3458.2901, + "eval_samples_per_second": 325.225, + "eval_steps_per_second": 20.327, + "step": 1400000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1400100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1400200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1400300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1400400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1400500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1400600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1400700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1400800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1400900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1401000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1401100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 1401200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1401300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1401400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1401500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1401600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1401700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1401800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1401900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 1402000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1402100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1402200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1402300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1402400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1402500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1402600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1402700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1402800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1402900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1403000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1403100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1403200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1403300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1403400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1403500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1403600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1403700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1403800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1403900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1404000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 1404100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1404200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1404300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1404400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1404500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1404600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1404700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1404800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1404900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1405000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1405100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1405200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1405300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1405400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1405500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1405600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1405700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 1405800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1405900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1406000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1406100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1406200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1406300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1406400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1406500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1406600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1406700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1406800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1406900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1407000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1407100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1407200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1407300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1407400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1407500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1407600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1407700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1407800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1407900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1408000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1408100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1408200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1408300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1408400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1408500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1408600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1408700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1408800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1408900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1409000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1409100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1409200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1409300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1409400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1409500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1409600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1409700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1409800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1409900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1410000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1410100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1410200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1410300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1410400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1410500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1410600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1410700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1410800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 1410900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1411000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 1411100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1411200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1411300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1411400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1411500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1411600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1411700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1411800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1411900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1412000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1412100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1412200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1412300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1412400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1412500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 1412600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1412700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1412800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1412900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1413000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1413100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1413200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1413300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1413400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 1413500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1413600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1413700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1413800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1413900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1414000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1414100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 1414200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1414300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1414400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1414500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 1414600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1414700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1414800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1414900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1415000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1415100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1415200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1415300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1415400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1415500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1415600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 1415700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1415800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1415900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1416000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1416100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1416200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1416300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 1416400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1416500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1416600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1416700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1416800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1416900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1417000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1417100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1417200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1417300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1417400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1417500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1417600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1417700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1417800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1417900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1418000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1418100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1418200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1418300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1418400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1418500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1418600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 1418700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 1418800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1418900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1419000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1419100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1419200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1419300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1419400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1419500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1419600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1419700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1419800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1419900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1420000 + }, + { + "epoch": 0.0, + "eval_loss": 0.02960205078125, + "eval_runtime": 3232.2323, + "eval_samples_per_second": 347.971, + "eval_steps_per_second": 21.748, + "step": 1420000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1420100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1420200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1420300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1420400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1420500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1420600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1420700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1420800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1420900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1421000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1421100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1421200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1421300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1421400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1421500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1421600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1421700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1421800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1421900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1422000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1422100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1422200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1422300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1422400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1422500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1422600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1422700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1422800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1422900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1423000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1423100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1423200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1423300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1423400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1423500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1423600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1423700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1423800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1423900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1424000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1424100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1424200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1424300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1424400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1424500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1424600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1424700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1424800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1424900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1425000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1425100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1425200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1425300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1425400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1425500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1425600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1425700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1425800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1425900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1426000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1426100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1426200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1426300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1426400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 1426500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1426600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1426700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1426800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1426900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1427000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1427100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1427200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1427300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1427400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1427500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1427600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1427700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 1427800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1427900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1428000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1428100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1428200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1428300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1428400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1428500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1428600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1428700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1428800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1428900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1429000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1429100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 1429200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1429300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1429400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1429500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1429600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1429700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1429800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1429900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1430000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 1430100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1430200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 1430300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 1430400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1430500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1430600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1430700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1430800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1430900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1431000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1431100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1431200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1431300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 1431400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1431500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1431600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1431700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 1431800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1431900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1432000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1432100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1432200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1432300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1432400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1432500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1432600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1432700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1432800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1432900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1433000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1433100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1433200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1433300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1433400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 1433500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1433600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1433700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1433800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 1433900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1434000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1434100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1434200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1434300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1434400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1434500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1434600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1434700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1434800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1434900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1435000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1435100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1435200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1435300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1435400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1435500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1435600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1435700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1435800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1435900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1436000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1436100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1436200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1436300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1436400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1436500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1436600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1436700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1436800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1436900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1437000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1437100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 1437200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1437300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1437400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1437500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1437600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1437700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1437800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1437900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 1438000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1438100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1438200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1438300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1438400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1438500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1438600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1438700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1438800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1438900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 1439000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1439100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1439200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1439300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1439400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1439500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1439600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1439700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1439800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1439900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1440000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0299072265625, + "eval_runtime": 3252.8152, + "eval_samples_per_second": 345.769, + "eval_steps_per_second": 21.611, + "step": 1440000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1440100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1440200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1440300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1440400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1440500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1440600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1440700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1440800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1440900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1441000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1441100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1441200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1441300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1441400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1441500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1441600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1441700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1441800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1441900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 1442000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1442100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1442200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1442300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1442400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1442500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 1442600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1442700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1442800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1442900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1443000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1443100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1443200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1443300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1443400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1443500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1443600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1443700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1443800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1443900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1444000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1444100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1444200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1444300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1444400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1444500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1444600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1444700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1444800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1444900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 1445000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1445100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1445200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1445300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1445400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1445500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1445600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1445700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1445800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1445900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 1446000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1446100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1446200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 1446300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1446400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1446500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1446600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 1446700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 1446800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1446900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1447000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1447100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1447200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1447300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1447400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1447500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1447600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1447700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1447800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1447900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1448000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1448100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1448200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1448300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1448400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 1448500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1448600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1448700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1448800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1448900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1449000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1449100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1449200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1449300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1449400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1449500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1449600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1449700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1449800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1449900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1450000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 1450100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1450200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1450300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1450400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1450500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1450600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1450700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1450800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1450900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1451000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1451100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1451200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1451300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1451400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1451500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1451600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 1451700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 1451800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1451900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1452000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1452100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 1452200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1452300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1452400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1452500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1452600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1452700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1452800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1452900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1453000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1453100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1453200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1453300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1453400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1453500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1453600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1453700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1453800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1453900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1454000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1454100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1454200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1454300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1454400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1454500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1454600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1454700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1454800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1454900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1455000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1455100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1455200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1455300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1455400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 1455500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 1455600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1455700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1455800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1455900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1456000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1456100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1456200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 1456300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1456400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1456500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1456600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1456700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1456800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1456900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1457000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1457100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1457200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1457300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1457400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1457500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1457600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1457700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1457800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1457900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 1458000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1458100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1458200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1458300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 1458400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1458500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1458600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1458700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1458800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1458900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1459000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1459100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1459200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1459300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1459400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1459500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1459600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1459700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1459800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1459900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1460000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0298309326171875, + "eval_runtime": 3154.1525, + "eval_samples_per_second": 356.585, + "eval_steps_per_second": 22.287, + "step": 1460000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1460100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1460200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1460300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1460400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 1460500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 1460600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1460700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1460800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1460900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1461000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1461100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 1461200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1461300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1461400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1461500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1461600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1461700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1461800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1461900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1462000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1462100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1462200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1462300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1462400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1462500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 1462600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1462700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1462800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1462900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1463000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1463100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1463200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1463300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1463400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1463500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1463600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1463700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1463800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 1463900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1464000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1464100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1464200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1464300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1464400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 1464500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1464600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1464700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1464800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1464900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1465000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1465100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1465200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 1465300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1465400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1465500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0418, + "step": 1465600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1465700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 1465800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1465900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1466000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1466100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1466200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1466300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1466400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1466500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1466600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1466700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1466800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1466900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1467000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 1467100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1467200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1467300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1467400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1467500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1467600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1467700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1467800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1467900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1468000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1468100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1468200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1468300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 1468400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1468500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1468600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1468700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1468800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1468900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1469000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1469100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1469200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1469300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1469400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1469500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1469600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1469700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1469800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1469900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1470000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1470100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1470200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1470300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1470400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1470500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1470600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1470700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1470800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1470900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1471000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1471100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1471200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1471300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1471400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1471500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1471600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1471700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1471800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1471900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1472000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1472100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1472200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1472300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1472400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1472500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1472600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1472700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1472800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 1472900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1473000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1473100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1473200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1473300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1473400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1473500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1473600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1473700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1473800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1473900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1474000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1474100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1474200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1474300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1474400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 1474500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 1474600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1474700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1474800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 1474900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1475000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1475100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1475200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1475300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1475400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1475500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1475600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1475700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 1475800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1475900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1476000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1476100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 1476200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1476300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1476400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 1476500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1476600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1476700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1476800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1476900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1477000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1477100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1477200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1477300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1477400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1477500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1477600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1477700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1477800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1477900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1478000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 1478100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1478200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1478300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1478400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1478500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1478600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1478700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1478800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1478900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1479000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1479100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1479200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1479300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1479400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1479500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1479600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1479700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 1479800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1479900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1480000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0298309326171875, + "eval_runtime": 3118.3877, + "eval_samples_per_second": 360.675, + "eval_steps_per_second": 22.542, + "step": 1480000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1480100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1480200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1480300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1480400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1480500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 1480600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1480700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1480800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1480900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1481000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1481100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1481200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1481300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1481400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1481500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1481600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1481700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1481800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 1481900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1482000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1482100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1482200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1482300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1482400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1482500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1482600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1482700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1482800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1482900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1483000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1483100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1483200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1483300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1483400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1483500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1483600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1483700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1483800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1483900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1484000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1484100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1484200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1484300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1484400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1484500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1484600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 1484700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1484800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1484900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1485000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1485100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1485200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1485300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1485400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1485500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1485600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1485700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1485800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1485900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1486000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1486100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1486200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1486300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1486400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1486500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 1486600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1486700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1486800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1486900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1487000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1487100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1487200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1487300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1487400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1487500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1487600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1487700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1487800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1487900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1488000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1488100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 1488200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 1488300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1488400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1488500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1488600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1488700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1488800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1488900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1489000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1489100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1489200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1489300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1489400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1489500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1489600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1489700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1489800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1489900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1490000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1490100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1490200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1490300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1490400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1490500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1490600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1490700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1490800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1490900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1491000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1491100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1491200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1491300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1491400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1491500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1491600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1491700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1491800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1491900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 1492000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1492100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1492200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1492300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1492400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1492500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1492600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1492700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 1492800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1492900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1493000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1493100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1493200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1493300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1493400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1493500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1493600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1493700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1493800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1493900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1494000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1494100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1494200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1494300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 1494400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1494500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 1494600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1494700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1494800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1494900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1495000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1495100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1495200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1495300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 1495400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1495500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1495600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 1495700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1495800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 1495900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1496000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1496100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1496200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 1496300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1496400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1496500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1496600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1496700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1496800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1496900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1497000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1497100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1497200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1497300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1497400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1497500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1497600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1497700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1497800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1497900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1498000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1498100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1498200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1498300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1498400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 1498500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1498600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1498700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1498800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1498900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1499000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1499100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1499200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1499300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1499400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1499500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1499600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1499700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1499800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1499900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1500000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0298919677734375, + "eval_runtime": 3292.6728, + "eval_samples_per_second": 341.584, + "eval_steps_per_second": 21.349, + "step": 1500000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1500100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 1500200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1500300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1500400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1500500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1500600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1500700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1500800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1500900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1501000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1501100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1501200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 1501300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1501400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1501500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1501600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 1501700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1501800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1501900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1502000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1502100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1502200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1502300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1502400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 1502500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1502600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1502700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1502800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1502900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1503000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1503100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 1503200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1503300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1503400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1503500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1503600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1503700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1503800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1503900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1504000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1504100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1504200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1504300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1504400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1504500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1504600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1504700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1504800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1504900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1505000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1505100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1505200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1505300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1505400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1505500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1505600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1505700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1505800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1505900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1506000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1506100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1506200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1506300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1506400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 1506500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1506600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1506700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 1506800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1506900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1507000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1507100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1507200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1507300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1507400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 1507500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1507600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1507700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1507800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1507900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1508000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1508100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1508200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1508300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1508400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1508500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1508600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1508700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1508800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1508900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1509000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1509100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1509200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1509300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1509400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1509500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1509600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1509700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1509800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1509900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 1510000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1510100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1510200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1510300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 1510400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1510500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1510600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1510700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1510800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1510900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 1511000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1511100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1511200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1511300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1511400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1511500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1511600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1511700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1511800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1511900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1512000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1512100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1512200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1512300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1512400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1512500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1512600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1512700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1512800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1512900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1513000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1513100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1513200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1513300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1513400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1513500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1513600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 1513700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1513800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1513900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 1514000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1514100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1514200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1514300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1514400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1514500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1514600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1514700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1514800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1514900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1515000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1515100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1515200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1515300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1515400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1515500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1515600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1515700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1515800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 1515900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1516000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1516100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1516200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1516300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1516400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1516500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1516600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1516700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1516800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1516900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1517000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1517100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1517200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1517300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1517400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1517500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1517600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1517700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1517800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1517900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1518000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1518100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1518200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1518300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1518400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1518500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1518600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1518700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1518800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1518900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1519000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1519100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1519200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1519300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1519400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1519500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1519600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 1519700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1519800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1519900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1520000 + }, + { + "epoch": 0.0, + "eval_loss": 0.029541015625, + "eval_runtime": 3474.5692, + "eval_samples_per_second": 323.701, + "eval_steps_per_second": 20.232, + "step": 1520000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1520100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1520200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1520300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1520400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1520500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1520600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1520700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1520800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 1520900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1521000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 1521100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1521200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1521300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1521400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1521500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1521600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1521700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1521800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1521900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1522000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1522100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1522200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1522300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1522400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1522500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1522600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1522700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1522800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1522900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1523000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1523100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1523200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1523300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1523400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1523500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1523600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1523700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1523800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 1523900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1524000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1524100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1524200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1524300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1524400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1524500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1524600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1524700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 1524800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1524900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1525000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1525100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1525200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1525300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1525400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1525500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1525600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1525700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1525800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1525900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1526000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1526100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1526200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1526300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1526400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1526500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1526600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1526700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1526800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1526900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1527000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 1527100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1527200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1527300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1527400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1527500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1527600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1527700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1527800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1527900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1528000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1528100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1528200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1528300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1528400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1528500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1528600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1528700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 1528800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 1528900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1529000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1529100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1529200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1529300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1529400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1529500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 1529600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1529700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1529800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1529900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1530000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 1530100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1530200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1530300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1530400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1530500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1530600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1530700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1530800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 1530900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1531000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1531100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1531200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1531300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 1531400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1531500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1531600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 1531700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1531800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1531900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 1532000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1532100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1532200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1532300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1532400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1532500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1532600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1532700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1532800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1532900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1533000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1533100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1533200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1533300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1533400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 1533500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1533600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 1533700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1533800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1533900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1534000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1534100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1534200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1534300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1534400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1534500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 1534600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1534700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1534800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1534900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1535000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1535100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1535200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 1535300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1535400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1535500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1535600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1535700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1535800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 1535900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1536000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1536100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1536200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1536300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1536400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 1536500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1536600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1536700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1536800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1536900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1537000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1537100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1537200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1537300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1537400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1537500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1537600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1537700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1537800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1537900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1538000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 1538100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1538200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1538300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1538400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1538500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1538600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1538700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1538800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1538900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1539000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1539100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1539200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1539300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1539400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1539500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1539600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1539700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1539800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1539900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1540000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0293731689453125, + "eval_runtime": 3714.1825, + "eval_samples_per_second": 302.818, + "eval_steps_per_second": 18.926, + "step": 1540000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1540100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1540200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1540300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1540400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1540500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1540600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1540700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1540800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1540900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1541000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1541100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1541200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1541300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1541400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1541500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1541600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1541700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1541800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1541900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1542000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1542100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1542200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1542300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1542400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1542500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 1542600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1542700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1542800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1542900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1543000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1543100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1543200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1543300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1543400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1543500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 1543600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1543700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1543800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1543900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1544000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1544100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1544200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1544300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1544400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1544500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1544600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 1544700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1544800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1544900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1545000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1545100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1545200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1545300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1545400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1545500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1545600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1545700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1545800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1545900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1546000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1546100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1546200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1546300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1546400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 1546500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1546600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1546700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1546800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1546900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1547000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1547100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1547200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1547300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1547400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1547500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1547600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1547700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1547800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1547900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1548000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1548100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 1548200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1548300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1548400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1548500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 1548600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1548700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1548800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1548900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1549000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 1549100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1549200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1549300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1549400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1549500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1549600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1549700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1549800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1549900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1550000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1550100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1550200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1550300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1550400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1550500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1550600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1550700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1550800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1550900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1551000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1551100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 1551200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1551300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1551400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1551500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1551600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1551700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1551800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1551900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1552000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1552100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1552200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 1552300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1552400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1552500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1552600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1552700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1552800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1552900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1553000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1553100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1553200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1553300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1553400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 1553500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1553600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1553700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1553800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1553900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1554000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1554100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0294, + "step": 1554200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1554300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1554400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1554500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1554600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1554700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1554800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1554900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 1555000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1555100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1555200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1555300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1555400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1555500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1555600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1555700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1555800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1555900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1556000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1556100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1556200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 1556300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1556400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1556500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 1556600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1556700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1556800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1556900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1557000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1557100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1557200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 1557300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1557400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1557500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1557600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1557700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1557800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1557900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1558000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1558100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1558200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1558300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1558400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1558500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1558600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 1558700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1558800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1558900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1559000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1559100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1559200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1559300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1559400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1559500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1559600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1559700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1559800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1559900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1560000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0294952392578125, + "eval_runtime": 4014.0505, + "eval_samples_per_second": 280.197, + "eval_steps_per_second": 17.512, + "step": 1560000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1560100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1560200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1560300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1560400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1560500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1560600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1560700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1560800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1560900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1561000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1561100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1561200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1561300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1561400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1561500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1561600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1561700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1561800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1561900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1562000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1562100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1562200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1562300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1562400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1562500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1562600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1562700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1562800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1562900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 1563000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1563100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1563200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1563300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1563400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1563500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1563600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1563700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1563800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1563900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1564000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1564100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1564200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1564300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 1564400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1564500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1564600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1564700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 1564800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1564900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1565000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1565100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1565200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 1565300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1565400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1565500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1565600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1565700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1565800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1565900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1566000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1566100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1566200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1566300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 1566400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 1566500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1566600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1566700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1566800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1566900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1567000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1567100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1567200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1567300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1567400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 1567500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1567600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1567700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1567800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1567900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1568000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1568100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1568200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1568300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 1568400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1568500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1568600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1568700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1568800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1568900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1569000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1569100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1569200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1569300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1569400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1569500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1569600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 1569700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1569800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1569900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1570000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1570100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1570200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1570300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1570400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1570500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1570600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1570700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1570800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1570900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1571000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 1571100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1571200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1571300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1571400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1571500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1571600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1571700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1571800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1571900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 1572000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1572100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1572200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1572300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1572400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1572500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1572600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1572700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1572800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1572900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 1573000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1573100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1573200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1573300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1573400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1573500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1573600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1573700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1573800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1573900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1574000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1574100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1574200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1574300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1574400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 1574500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1574600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1574700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1574800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1574900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1575000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1575100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1575200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1575300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1575400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1575500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1575600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1575700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1575800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1575900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1576000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1576100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1576200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1576300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1576400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1576500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1576600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1576700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1576800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1576900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1577000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 1577100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1577200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1577300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1577400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1577500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 1577600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 1577700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1577800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1577900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1578000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1578100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1578200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1578300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1578400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1578500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1578600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1578700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 1578800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1578900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1579000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1579100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1579200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1579300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1579400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1579500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1579600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1579700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1579800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1579900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1580000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0294647216796875, + "eval_runtime": 4066.6786, + "eval_samples_per_second": 276.57, + "eval_steps_per_second": 17.286, + "step": 1580000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1580100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1580200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1580300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1580400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0294, + "step": 1580500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1580600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1580700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1580800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1580900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1581000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 1581100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1581200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1581300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1581400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1581500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1581600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1581700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1581800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1581900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1582000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1582100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1582200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1582300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1582400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1582500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1582600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 1582700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1582800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1582900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1583000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1583100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1583200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1583300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1583400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1583500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 1583600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1583700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1583800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1583900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1584000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1584100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1584200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1584300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1584400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1584500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1584600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1584700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1584800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1584900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1585000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1585100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1585200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1585300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0296, + "step": 1585400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1585500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1585600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1585700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1585800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1585900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1586000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1586100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1586200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1586300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1586400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1586500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1586600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1586700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1586800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 1586900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1587000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1587100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 1587200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1587300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1587400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 1587500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1587600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1587700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1587800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1587900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1588000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 1588100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1588200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1588300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1588400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1588500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1588600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1588700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1588800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1588900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1589000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1589100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1589200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1589300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1589400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1589500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1589600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1589700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 1589800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1589900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1590000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1590100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1590200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1590300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1590400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1590500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1590600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1590700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 1590800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1590900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1591000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1591100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1591200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1591300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 1591400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 1591500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 1591600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1591700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1591800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1591900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1592000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1592100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1592200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1592300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1592400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1592500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1592600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1592700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1592800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 1592900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1593000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1593100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1593200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1593300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 1593400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1593500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1593600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1593700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1593800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1593900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1594000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1594100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1594200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1594300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1594400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1594500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1594600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1594700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1594800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1594900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1595000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1595100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1595200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1595300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1595400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1595500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1595600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1595700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1595800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1595900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1596000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 1596100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1596200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1596300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1596400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1596500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1596600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1596700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1596800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1596900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 1597000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1597100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1597200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1597300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1597400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 1597500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 1597600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1597700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1597800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1597900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1598000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1598100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1598200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1598300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1598400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1598500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1598600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1598700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1598800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1598900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1599000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1599100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1599200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1599300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1599400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1599500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1599600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1599700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1599800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1599900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1600000 + }, + { + "epoch": 0.0, + "eval_loss": 0.029632568359375, + "eval_runtime": 3871.4209, + "eval_samples_per_second": 290.519, + "eval_steps_per_second": 18.158, + "step": 1600000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1600100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 1600200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1600300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1600400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1600500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1600600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1600700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1600800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1600900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1601000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1601100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1601200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1601300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1601400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1601500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 1601600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 1601700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1601800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1601900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1602000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1602100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1602200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1602300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1602400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1602500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1602600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1602700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1602800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1602900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1603000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1603100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1603200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1603300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1603400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1603500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1603600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1603700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1603800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 1603900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1604000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1604100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1604200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1604300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1604400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1604500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1604600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1604700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1604800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1604900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1605000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1605100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1605200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1605300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1605400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1605500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1605600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1605700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1605800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1605900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1606000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1606100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1606200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1606300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1606400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 1606500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1606600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0473, + "step": 1606700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1606800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0382, + "step": 1606900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 1607000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0396, + "step": 1607100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1607200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1607300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1607400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1607500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1607600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1607700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1607800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1607900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1608000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1608100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1608200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1608300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1608400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1608500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1608600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1608700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1608800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1608900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1609000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 1609100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1609200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1609300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1609400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1609500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1609600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1609700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1609800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 1609900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1610000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1610100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1610200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1610300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1610400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1610500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1610600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1610700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1610800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1610900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1611000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1611100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1611200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1611300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1611400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1611500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1611600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1611700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1611800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1611900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1612000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1612100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1612200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 1612300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1612400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1612500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1612600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 1612700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1612800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1612900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1613000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 1613100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 1613200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0468, + "step": 1613300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1613400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1613500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1613600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1613700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1613800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1613900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1614000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1614100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1614200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1614300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1614400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1614500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1614600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1614700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1614800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1614900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 1615000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1615100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1615200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1615300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1615400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1615500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1615600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1615700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1615800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1615900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1616000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1616100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1616200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 1616300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1616400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1616500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1616600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1616700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1616800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1616900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1617000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1617100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1617200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1617300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1617400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1617500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1617600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1617700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1617800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1617900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1618000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1618100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1618200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1618300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1618400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1618500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1618600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1618700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 1618800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1618900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1619000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1619100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1619200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1619300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1619400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1619500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 1619600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1619700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1619800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1619900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1620000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0294647216796875, + "eval_runtime": 4096.2178, + "eval_samples_per_second": 274.576, + "eval_steps_per_second": 17.161, + "step": 1620000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1620100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1620200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1620300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1620400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 1620500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1620600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1620700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1620800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1620900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1621000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1621100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1621200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1621300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1621400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 1621500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1621600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1621700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1621800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1621900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1622000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1622100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1622200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1622300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1622400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1622500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1622600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1622700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1622800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 1622900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1623000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1623100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1623200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1623300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1623400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1623500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1623600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1623700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1623800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1623900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1624000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1624100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1624200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1624300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1624400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1624500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1624600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1624700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1624800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1624900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1625000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1625100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1625200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 1625300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1625400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1625500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1625600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1625700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1625800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1625900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1626000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1626100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1626200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1626300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1626400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1626500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1626600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1626700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1626800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1626900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1627000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1627100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1627200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 1627300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1627400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1627500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 1627600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1627700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1627800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1627900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1628000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1628100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1628200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1628300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1628400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1628500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1628600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1628700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1628800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1628900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1629000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1629100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1629200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1629300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1629400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1629500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1629600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1629700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1629800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1629900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 1630000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1630100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1630200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1630300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1630400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1630500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1630600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1630700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1630800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1630900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1631000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1631100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1631200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1631300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1631400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 1631500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1631600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1631700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1631800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1631900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1632000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1632100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1632200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1632300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1632400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1632500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1632600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1632700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1632800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1632900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 1633000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1633100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1633200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1633300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1633400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1633500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1633600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 1633700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1633800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1633900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1634000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1634100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1634200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1634300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1634400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 1634500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1634600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1634700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1634800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1634900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1635000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1635100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1635200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1635300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1635400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1635500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1635600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 1635700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1635800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1635900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1636000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1636100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 1636200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1636300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1636400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1636500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1636600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1636700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1636800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1636900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1637000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1637100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1637200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 1637300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1637400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 1637500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1637600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1637700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1637800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1637900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1638000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1638100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1638200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1638300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1638400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1638500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1638600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1638700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1638800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1638900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1639000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1639100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1639200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1639300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1639400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1639500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1639600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1639700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1639800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1639900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1640000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0295257568359375, + "eval_runtime": 4142.3589, + "eval_samples_per_second": 271.518, + "eval_steps_per_second": 16.97, + "step": 1640000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1640100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1640200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1640300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1640400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1640500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1640600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1640700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1640800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1640900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1641000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1641100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1641200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1641300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1641400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1641500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1641600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1641700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 1641800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1641900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1642000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1642100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 1642200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1642300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1642400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1642500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1642600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1642700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1642800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1642900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1643000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1643100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1643200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 1643300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1643400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1643500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1643600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1643700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1643800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1643900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1644000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1644100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 1644200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1644300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1644400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1644500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 1644600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1644700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1644800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1644900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1645000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1645100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1645200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1645300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1645400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1645500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1645600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1645700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1645800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1645900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1646000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1646100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 1646200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1646300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1646400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1646500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1646600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1646700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 1646800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1646900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1647000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1647100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1647200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1647300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1647400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 1647500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1647600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1647700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1647800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1647900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1648000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1648100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1648200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1648300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1648400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1648500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1648600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1648700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1648800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1648900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1649000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1649100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1649200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1649300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1649400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1649500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1649600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1649700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1649800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1649900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1650000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1650100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1650200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1650300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1650400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1650500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1650600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1650700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1650800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1650900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1651000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1651100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1651200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1651300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1651400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1651500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1651600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1651700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1651800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 1651900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1652000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1652100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1652200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1652300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1652400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1652500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1652600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1652700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1652800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1652900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1653000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1653100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1653200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1653300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1653400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1653500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1653600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1653700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1653800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1653900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1654000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1654100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 1654200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1654300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1654400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1654500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1654600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1654700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1654800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1654900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1655000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1655100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1655200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1655300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1655400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1655500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1655600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1655700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1655800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1655900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1656000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1656100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1656200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1656300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1656400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1656500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1656600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 1656700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1656800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0411, + "step": 1656900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1657000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1657100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1657200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1657300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1657400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1657500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1657600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1657700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1657800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1657900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1658000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1658100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1658200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1658300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1658400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1658500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1658600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 1658700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1658800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1658900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1659000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 1659100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1659200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 1659300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1659400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1659500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1659600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1659700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1659800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1659900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1660000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0294342041015625, + "eval_runtime": 3159.0731, + "eval_samples_per_second": 356.029, + "eval_steps_per_second": 22.252, + "step": 1660000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1660100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1660200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1660300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 1660400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1660500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1660600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1660700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1660800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1660900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1661000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1661100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1661200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1661300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1661400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 1661500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1661600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1661700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1661800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1661900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1662000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1662100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 1662200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1662300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1662400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1662500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 1662600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1662700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1662800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1662900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1663000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1663100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1663200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1663300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1663400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1663500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1663600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1663700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1663800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1663900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1664000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1664100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1664200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1664300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1664400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 1664500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1664600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1664700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1664800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1664900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 1665000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1665100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1665200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1665300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1665400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1665500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1665600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1665700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 1665800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1665900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 1666000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0538, + "step": 1666100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 1666200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1666300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1666400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1666500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1666600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 1666700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1666800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1666900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1667000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1667100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1667200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1667300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1667400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1667500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1667600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1667700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1667800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1667900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1668000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1668100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 1668200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1668300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1668400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1668500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1668600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1668700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1668800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1668900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1669000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1669100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1669200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1669300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1669400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1669500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1669600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1669700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1669800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1669900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1670000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 1670100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1670200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1670300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1670400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1670500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 1670600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 1670700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1670800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1670900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1671000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1671100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1671200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1671300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1671400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1671500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1671600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1671700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1671800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1671900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1672000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1672100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1672200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1672300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1672400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 1672500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1672600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1672700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1672800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1672900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1673000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1673100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1673200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1673300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1673400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1673500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1673600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1673700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1673800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1673900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1674000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1674100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1674200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1674300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1674400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1674500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1674600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1674700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1674800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1674900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1675000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1675100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1675200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1675300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1675400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 1675500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 1675600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1675700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1675800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1675900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1676000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1676100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1676200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1676300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1676400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1676500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1676600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1676700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1676800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1676900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1677000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1677100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1677200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1677300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1677400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 1677500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 1677600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1677700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1677800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1677900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1678000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1678100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1678200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1678300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1678400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1678500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1678600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1678700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1678800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1678900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1679000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1679100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1679200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1679300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1679400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1679500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1679600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1679700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1679800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1679900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1680000 + }, + { + "epoch": 0.0, + "eval_loss": 0.029327392578125, + "eval_runtime": 3073.5891, + "eval_samples_per_second": 365.931, + "eval_steps_per_second": 22.871, + "step": 1680000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1680100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1680200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1680300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1680400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1680500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1680600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1680700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1680800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1680900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1681000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1681100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 1681200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 1681300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1681400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1681500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1681600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1681700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1681800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1681900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1682000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1682100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1682200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1682300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1682400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1682500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1682600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1682700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1682800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1682900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1683000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1683100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1683200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1683300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1683400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1683500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1683600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 1683700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1683800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1683900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1684000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1684100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1684200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1684300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1684400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1684500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1684600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1684700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1684800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1684900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1685000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1685100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1685200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1685300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1685400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1685500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1685600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1685700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1685800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1685900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1686000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1686100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 1686200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1686300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1686400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1686500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 1686600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1686700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1686800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1686900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1687000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1687100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 1687200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 1687300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1687400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1687500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1687600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1687700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1687800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 1687900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1688000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1688100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1688200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1688300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1688400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1688500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1688600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1688700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1688800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1688900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1689000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1689100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1689200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 1689300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1689400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1689500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1689600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 1689700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1689800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1689900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1690000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 1690100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1690200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1690300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1690400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1690500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 1690600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1690700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1690800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1690900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1691000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1691100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1691200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1691300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1691400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1691500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1691600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1691700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1691800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1691900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1692000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1692100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0371, + "step": 1692200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1692300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1692400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1692500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1692600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 1692700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1692800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1692900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1693000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1693100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1693200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1693300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1693400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1693500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1693600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1693700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1693800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1693900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1694000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1694100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1694200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1694300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1694400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1694500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1694600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1694700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1694800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1694900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1695000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1695100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1695200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1695300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1695400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1695500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1695600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1695700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1695800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1695900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1696000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1696100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1696200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1696300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1696400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1696500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1696600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1696700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1696800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1696900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1697000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1697100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1697200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1697300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1697400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 1697500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1697600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1697700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1697800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1697900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1698000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1698100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1698200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1698300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1698400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1698500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1698600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1698700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1698800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1698900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1699000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1699100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1699200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1699300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1699400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1699500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1699600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1699700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 1699800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1699900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1700000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0290679931640625, + "eval_runtime": 3020.7946, + "eval_samples_per_second": 372.327, + "eval_steps_per_second": 23.271, + "step": 1700000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1700100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1700200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1700300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1700400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1700500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1700600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1700700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1700800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1700900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1701000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1701100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1701200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1701300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1701400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1701500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1701600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1701700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1701800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1701900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1702000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1702100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1702200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1702300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1702400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1702500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 1702600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1702700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1702800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1702900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1703000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1703100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1703200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1703300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1703400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1703500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1703600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1703700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1703800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1703900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1704000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1704100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1704200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1704300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1704400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1704500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1704600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1704700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1704800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1704900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1705000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1705100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1705200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1705300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1705400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1705500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1705600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1705700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1705800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1705900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1706000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1706100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 1706200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 1706300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1706400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1706500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1706600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1706700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1706800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1706900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1707000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1707100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1707200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1707300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1707400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1707500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1707600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1707700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1707800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1707900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1708000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1708100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1708200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1708300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1708400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 1708500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1708600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 1708700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1708800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1708900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1709000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1709100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1709200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 1709300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 1709400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1709500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1709600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1709700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1709800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1709900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 1710000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1710100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1710200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1710300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 1710400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1710500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1710600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1710700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1710800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1710900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1711000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1711100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1711200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 1711300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1711400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1711500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1711600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1711700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1711800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 1711900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1712000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1712100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1712200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1712300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 1712400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1712500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0296, + "step": 1712600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1712700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1712800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1712900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1713000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1713100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1713200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1713300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1713400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1713500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1713600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1713700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1713800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1713900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1714000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1714100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1714200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1714300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1714400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1714500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1714600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 1714700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1714800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1714900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1715000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1715100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1715200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1715300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1715400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1715500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1715600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1715700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 1715800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1715900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1716000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1716100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 1716200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1716300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1716400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1716500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1716600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1716700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1716800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 1716900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1717000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1717100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1717200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1717300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1717400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1717500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 1717600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 1717700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 1717800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1717900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1718000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 1718100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1718200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1718300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1718400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1718500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1718600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1718700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1718800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 1718900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1719000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1719100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1719200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1719300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1719400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1719500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1719600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1719700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1719800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1719900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1720000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0290069580078125, + "eval_runtime": 3004.4195, + "eval_samples_per_second": 374.356, + "eval_steps_per_second": 23.398, + "step": 1720000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1720100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1720200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1720300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1720400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 1720500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1720600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1720700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1720800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1720900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1721000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1721100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1721200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1721300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 1721400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1721500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1721600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1721700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1721800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1721900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1722000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1722100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1722200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1722300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1722400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1722500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1722600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1722700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1722800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1722900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1723000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1723100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1723200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 1723300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1723400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1723500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1723600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1723700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1723800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1723900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1724000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1724100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1724200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1724300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1724400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1724500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1724600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1724700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1724800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1724900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1725000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1725100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1725200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1725300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1725400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1725500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1725600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1725700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1725800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1725900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1726000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1726100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1726200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1726300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1726400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1726500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1726600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1726700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1726800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1726900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1727000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1727100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1727200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 1727300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1727400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1727500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1727600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1727700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1727800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1727900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0368, + "step": 1728000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1728100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1728200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1728300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1728400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1728500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1728600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1728700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1728800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1728900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1729000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1729100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1729200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1729300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1729400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1729500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1729600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1729700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1729800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 1729900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1730000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1730100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1730200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1730300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 1730400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1730500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1730600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1730700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1730800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1730900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1731000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1731100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1731200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1731300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1731400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1731500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1731600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1731700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1731800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1731900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 1732000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1732100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1732200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1732300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1732400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1732500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1732600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1732700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1732800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1732900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 1733000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1733100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1733200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1733300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1733400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1733500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1733600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1733700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1733800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1733900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1734000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1734100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1734200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1734300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1734400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1734500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1734600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1734700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1734800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1734900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 1735000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1735100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1735200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1735300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1735400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1735500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1735600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 1735700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 1735800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1735900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1736000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1736100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1736200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1736300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1736400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1736500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1736600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1736700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 1736800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1736900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1737000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1737100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1737200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1737300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1737400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 1737500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1737600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 1737700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 1737800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1737900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1738000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1738100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1738200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1738300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1738400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1738500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1738600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 1738700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1738800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1738900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1739000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1739100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1739200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 1739300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1739400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1739500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1739600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1739700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1739800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1739900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1740000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0287322998046875, + "eval_runtime": 3010.2667, + "eval_samples_per_second": 373.629, + "eval_steps_per_second": 23.352, + "step": 1740000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1740100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1740200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1740300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1740400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0292, + "step": 1740500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1740600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1740700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1740800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1740900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 1741000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1741100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1741200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 1741300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1741400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1741500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1741600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1741700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1741800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1741900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1742000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1742100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1742200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1742300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0368, + "step": 1742400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1742500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1742600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1742700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1742800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1742900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1743000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1743100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1743200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1743300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1743400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1743500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1743600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1743700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1743800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 1743900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1744000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1744100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1744200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1744300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1744400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 1744500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1744600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1744700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1744800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1744900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1745000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1745100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1745200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1745300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1745400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1745500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 1745600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1745700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1745800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1745900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1746000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 1746100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1746200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1746300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1746400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1746500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1746600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1746700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1746800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1746900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 1747000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1747100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1747200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1747300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1747400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1747500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1747600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1747700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1747800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1747900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1748000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1748100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1748200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 1748300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1748400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1748500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1748600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1748700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1748800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1748900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1749000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 1749100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1749200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1749300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1749400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1749500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1749600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1749700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 1749800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1749900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1750000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1750100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1750200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1750300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1750400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1750500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1750600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1750700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1750800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1750900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1751000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1751100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1751200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1751300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1751400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1751500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1751600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1751700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1751800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1751900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1752000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1752100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1752200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1752300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1752400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1752500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1752600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1752700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1752800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1752900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1753000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1753100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1753200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1753300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 1753400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 1753500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1753600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1753700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1753800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 1753900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 1754000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1754100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 1754200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1754300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1754400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1754500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1754600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1754700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1754800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1754900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1755000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1755100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 1755200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1755300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1755400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1755500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1755600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1755700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1755800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1755900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1756000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1756100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1756200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1756300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1756400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1756500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 1756600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1756700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1756800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1756900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1757000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1757100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1757200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1757300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1757400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1757500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1757600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1757700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1757800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1757900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0439, + "step": 1758000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1758100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1758200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1758300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1758400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1758500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 1758600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1758700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1758800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1758900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 1759000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0371, + "step": 1759100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1759200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 1759300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1759400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1759500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1759600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1759700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1759800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1759900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1760000 + }, + { + "epoch": 0.0, + "eval_loss": 0.029693603515625, + "eval_runtime": 3108.7118, + "eval_samples_per_second": 361.797, + "eval_steps_per_second": 22.613, + "step": 1760000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1760100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1760200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1760300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1760400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1760500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1760600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1760700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1760800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1760900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1761000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1761100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1761200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1761300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1761400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1761500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1761600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1761700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1761800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1761900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.044, + "step": 1762000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1762100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1762200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1762300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 1762400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1762500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1762600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1762700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1762800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1762900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1763000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1763100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1763200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1763300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 1763400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1763500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1763600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1763700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1763800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1763900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1764000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1764100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 1764200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1764300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1764400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1764500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1764600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1764700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1764800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1764900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1765000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1765100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1765200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 1765300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1765400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1765500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1765600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 1765700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1765800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1765900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1766000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1766100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1766200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1766300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1766400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1766500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1766600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0544, + "step": 1766700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 1766800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1766900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1767000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1767100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1767200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1767300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1767400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1767500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1767600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1767700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1767800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1767900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1768000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1768100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1768200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 1768300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1768400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1768500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1768600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1768700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1768800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1768900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1769000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 1769100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 1769200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1769300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1769400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1769500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1769600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1769700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1769800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1769900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1770000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1770100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1770200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1770300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1770400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1770500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1770600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1770700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1770800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1770900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1771000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1771100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1771200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1771300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1771400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1771500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1771600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1771700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 1771800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1771900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1772000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1772100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1772200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1772300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1772400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1772500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1772600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1772700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1772800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1772900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1773000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1773100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1773200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1773300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1773400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1773500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1773600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1773700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1773800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1773900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0369, + "step": 1774000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1774100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1774200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1774300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1774400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1774500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1774600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1774700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1774800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1774900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1775000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 1775100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1775200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1775300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1775400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1775500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1775600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1775700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1775800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1775900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1776000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1776100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1776200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1776300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1776400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 1776500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1776600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1776700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1776800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1776900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 1777000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1777100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1777200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1777300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1777400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1777500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1777600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1777700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1777800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1777900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1778000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1778100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1778200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 1778300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1778400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1778500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1778600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1778700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 1778800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1778900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1779000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1779100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1779200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1779300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1779400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1779500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1779600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1779700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1779800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1779900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1780000 + }, + { + "epoch": 0.0, + "eval_loss": 0.028839111328125, + "eval_runtime": 3155.7136, + "eval_samples_per_second": 356.408, + "eval_steps_per_second": 22.276, + "step": 1780000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1780100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1780200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1780300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1780400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1780500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 1780600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1780700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1780800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1780900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1781000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1781100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1781200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 1781300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 1781400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1781500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1781600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1781700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1781800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1781900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1782000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1782100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1782200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1782300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1782400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1782500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1782600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1782700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1782800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1782900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 1783000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1783100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1783200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1783300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1783400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1783500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1783600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1783700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1783800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1783900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1784000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1784100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1784200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 1784300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 1784400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1784500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1784600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1784700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1784800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1784900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1785000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1785100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1785200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 1785300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1785400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1785500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1785600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1785700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1785800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1785900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1786000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1786100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1786200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1786300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1786400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 1786500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1786600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1786700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 1786800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1786900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0293, + "step": 1787000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1787100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1787200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1787300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1787400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 1787500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1787600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1787700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1787800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1787900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1788000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1788100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1788200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1788300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1788400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1788500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1788600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1788700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1788800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1788900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 1789000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 1789100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1789200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 1789300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1789400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1789500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1789600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1789700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1789800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1789900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1790000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1790100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1790200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1790300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1790400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1790500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1790600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1790700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1790800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1790900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1791000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1791100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1791200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1791300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1791400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1791500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1791600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1791700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1791800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 1791900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1792000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1792100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1792200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1792300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1792400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1792500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1792600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1792700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1792800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1792900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1793000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1793100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1793200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1793300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1793400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1793500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1793600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1793700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1793800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1793900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1794000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1794100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1794200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1794300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1794400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1794500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1794600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1794700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1794800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 1794900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1795000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1795100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1795200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1795300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1795400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1795500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1795600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1795700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1795800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1795900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1796000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1796100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1796200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1796300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1796400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1796500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1796600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1796700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1796800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1796900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1797000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1797100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1797200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1797300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1797400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1797500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1797600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1797700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1797800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1797900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1798000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1798100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1798200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1798300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1798400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1798500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1798600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1798700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1798800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1798900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1799000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1799100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1799200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1799300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 1799400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1799500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1799600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1799700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1799800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1799900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1800000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0287017822265625, + "eval_runtime": 3102.5431, + "eval_samples_per_second": 362.516, + "eval_steps_per_second": 22.658, + "step": 1800000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1800100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1800200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1800300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1800400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1800500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 1800600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1800700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1800800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1800900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1801000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1801100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 1801200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1801300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1801400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1801500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1801600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1801700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1801800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1801900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1802000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1802100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1802200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1802300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1802400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1802500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1802600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1802700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 1802800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1802900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1803000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1803100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1803200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1803300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1803400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1803500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1803600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1803700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1803800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1803900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1804000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1804100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1804200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1804300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1804400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1804500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1804600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1804700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1804800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1804900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1805000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1805100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1805200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1805300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1805400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1805500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1805600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1805700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1805800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1805900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1806000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1806100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1806200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1806300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 1806400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1806500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1806600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1806700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 1806800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1806900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1807000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1807100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1807200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1807300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1807400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1807500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1807600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1807700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1807800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1807900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1808000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1808100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1808200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1808300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1808400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1808500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1808600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1808700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1808800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1808900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1809000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1809100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1809200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1809300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1809400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1809500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1809600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1809700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1809800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1809900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 1810000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1810100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1810200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 1810300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 1810400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1810500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1810600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1810700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1810800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 1810900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1811000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1811100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1811200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1811300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1811400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1811500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1811600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 1811700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1811800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1811900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1812000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1812100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1812200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1812300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1812400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1812500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1812600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1812700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1812800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1812900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 1813000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1813100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1813200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1813300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1813400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1813500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1813600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1813700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1813800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1813900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1814000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1814100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1814200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1814300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1814400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1814500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1814600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1814700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1814800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1814900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1815000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1815100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1815200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1815300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1815400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1815500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 1815600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1815700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1815800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1815900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1816000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1816100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 1816200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 1816300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1816400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 1816500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1816600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1816700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1816800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 1816900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1817000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1817100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1817200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1817300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 1817400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1817500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1817600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1817700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1817800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 1817900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1818000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1818100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1818200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1818300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1818400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1818500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 1818600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1818700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1818800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1818900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1819000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 1819100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1819200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1819300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1819400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1819500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1819600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1819700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1819800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1819900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1820000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0287628173828125, + "eval_runtime": 3070.2573, + "eval_samples_per_second": 366.329, + "eval_steps_per_second": 22.896, + "step": 1820000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1820100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1820200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 1820300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1820400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 1820500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1820600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1820700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0391, + "step": 1820800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1820900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1821000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1821100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1821200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1821300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1821400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1821500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1821600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1821700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 1821800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1821900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1822000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1822100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1822200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1822300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1822400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1822500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1822600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1822700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1822800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1822900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1823000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1823100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1823200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1823300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1823400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1823500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1823600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1823700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1823800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1823900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1824000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 1824100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 1824200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1824300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1824400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1824500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1824600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1824700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1824800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1824900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1825000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1825100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 1825200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1825300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1825400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1825500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1825600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1825700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1825800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 1825900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0509, + "step": 1826000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0388, + "step": 1826100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1826200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 1826300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0506, + "step": 1826400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 1826500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 1826600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1826700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1826800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1826900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1827000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1827100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1827200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1827300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1827400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1827500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0388, + "step": 1827600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1827700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1827800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1827900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1828000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1828100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 1828200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1828300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1828400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1828500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1828600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1828700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1828800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1828900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1829000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1829100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1829200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1829300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1829400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1829500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1829600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1829700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1829800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1829900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1830000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1830100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0402, + "step": 1830200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 1830300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 1830400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 1830500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0392, + "step": 1830600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0373, + "step": 1830700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0394, + "step": 1830800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0368, + "step": 1830900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 1831000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 1831100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 1831200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 1831300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1831400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1831500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1831600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 1831700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1831800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1831900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1832000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1832100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1832200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1832300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 1832400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1832500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1832600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1832700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1832800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1832900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1833000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1833100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1833200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1833300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1833400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1833500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1833600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 1833700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1833800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1833900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1834000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1834100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1834200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1834300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1834400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1834500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1834600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1834700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1834800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1834900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1835000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1835100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1835200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1835300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1835400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0527, + "step": 1835500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1835600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1835700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1835800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1835900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1836000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1836100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1836200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1836300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1836400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1836500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 1836600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 1836700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0375, + "step": 1836800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0617, + "step": 1836900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 1837000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1837100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1837200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1837300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1837400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1837500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1837600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1837700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 1837800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1837900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 1838000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1838100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1838200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 1838300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0374, + "step": 1838400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1838500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1838600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1838700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1838800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1838900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1839000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1839100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1839200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1839300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1839400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1839500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1839600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1839700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1839800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1839900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1840000 + }, + { + "epoch": 0.0, + "eval_loss": 0.028961181640625, + "eval_runtime": 3049.6835, + "eval_samples_per_second": 368.8, + "eval_steps_per_second": 23.05, + "step": 1840000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1840100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1840200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1840300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1840400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1840500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1840600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1840700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1840800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1840900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1841000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1841100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1841200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 1841300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1841400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1841500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1841600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1841700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1841800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1841900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1842000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1842100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1842200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1842300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1842400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1842500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1842600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1842700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1842800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1842900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1843000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 1843100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1843200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 1843300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1843400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 1843500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1843600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 1843700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1843800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1843900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1844000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1844100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 1844200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1844300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1844400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1844500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1844600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1844700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1844800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1844900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1845000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 1845100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 1845200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1845300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0371, + "step": 1845400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1845500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1845600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1845700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1845800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1845900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1846000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1846100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 1846200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1846300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1846400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1846500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1846600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1846700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1846800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1846900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1847000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1847100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1847200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1847300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1847400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1847500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1847600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1847700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1847800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1847900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1848000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1848100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1848200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1848300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1848400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1848500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1848600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1848700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1848800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 1848900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1849000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1849100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 1849200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 1849300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1849400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1849500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1849600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1849700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 1849800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1849900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1850000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1850100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1850200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1850300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1850400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1850500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1850600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1850700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1850800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1850900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 1851000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1851100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1851200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1851300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1851400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1851500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1851600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1851700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 1851800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1851900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1852000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1852100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 1852200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1852300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1852400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1852500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 1852600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1852700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1852800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1852900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1853000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1853100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1853200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1853300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1853400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1853500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1853600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1853700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1853800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1853900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1854000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1854100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1854200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1854300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1854400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1854500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1854600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1854700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1854800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1854900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1855000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1855100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1855200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1855300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1855400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1855500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 1855600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1855700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 1855800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1855900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 1856000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1856100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1856200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1856300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1856400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1856500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1856600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1856700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1856800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1856900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1857000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1857100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1857200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1857300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1857400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1857500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1857600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0288, + "step": 1857700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1857800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1857900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1858000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 1858100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1858200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1858300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1858400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1858500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1858600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 1858700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1858800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1858900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1859000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1859100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1859200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1859300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1859400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1859500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1859600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1859700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1859800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1859900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1860000 + }, + { + "epoch": 0.0, + "eval_loss": 0.029266357421875, + "eval_runtime": 3065.0985, + "eval_samples_per_second": 366.945, + "eval_steps_per_second": 22.934, + "step": 1860000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1860100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1860200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1860300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1860400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1860500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1860600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1860700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1860800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1860900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 1861000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1861100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1861200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 1861300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1861400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1861500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1861600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1861700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1861800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1861900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1862000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 1862100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1862200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1862300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1862400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1862500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1862600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1862700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1862800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1862900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1863000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1863100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1863200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1863300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1863400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1863500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1863600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1863700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1863800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1863900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 1864000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1864100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1864200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1864300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1864400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1864500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1864600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1864700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1864800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1864900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1865000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1865100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1865200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1865300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1865400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1865500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1865600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1865700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1865800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1865900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1866000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 1866100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1866200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1866300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1866400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1866500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1866600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1866700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1866800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1866900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1867000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1867100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1867200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1867300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1867400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1867500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1867600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1867700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1867800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 1867900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1868000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1868100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1868200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1868300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1868400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1868500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1868600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1868700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1868800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1868900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1869000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 1869100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1869200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1869300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1869400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1869500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1869600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1869700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1869800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1869900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1870000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1870100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1870200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 1870300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1870400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1870500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1870600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1870700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1870800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1870900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 1871000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1871100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1871200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1871300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1871400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 1871500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1871600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1871700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1871800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1871900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1872000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1872100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1872200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1872300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1872400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1872500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1872600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1872700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1872800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1872900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1873000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1873100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1873200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1873300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1873400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1873500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1873600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1873700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1873800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1873900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1874000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1874100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1874200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1874300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1874400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 1874500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1874600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0397, + "step": 1874700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1874800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1874900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1875000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1875100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1875200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1875300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1875400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1875500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1875600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1875700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1875800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1875900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1876000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1876100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1876200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1876300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1876400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 1876500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1876600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1876700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1876800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1876900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 1877000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1877100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1877200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1877300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1877400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1877500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1877600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1877700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1877800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1877900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1878000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1878100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1878200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1878300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1878400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1878500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1878600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1878700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1878800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1878900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1879000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1879100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1879200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1879300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1879400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1879500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1879600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 1879700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1879800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 1879900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1880000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0290069580078125, + "eval_runtime": 3078.4324, + "eval_samples_per_second": 365.356, + "eval_steps_per_second": 22.835, + "step": 1880000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1880100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1880200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1880300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1880400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 1880500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1880600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1880700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 1880800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1880900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1881000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1881100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1881200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1881300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1881400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1881500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1881600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1881700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1881800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1881900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 1882000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1882100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1882200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1882300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1882400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1882500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1882600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1882700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1882800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1882900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1883000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1883100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1883200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1883300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1883400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1883500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1883600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1883700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1883800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1883900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1884000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1884100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1884200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1884300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 1884400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1884500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1884600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1884700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1884800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1884900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1885000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1885100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1885200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 1885300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1885400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1885500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1885600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1885700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0523, + "step": 1885800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1885900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1886000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1886100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1886200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1886300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1886400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1886500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1886600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1886700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1886800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1886900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1887000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1887100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1887200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1887300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1887400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1887500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1887600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1887700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1887800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1887900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1888000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1888100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1888200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1888300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0391, + "step": 1888400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1888500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1888600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1888700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1888800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1888900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1889000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1889100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1889200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1889300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1889400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1889500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1889600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1889700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1889800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1889900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1890000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1890100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1890200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0387, + "step": 1890300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1890400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1890500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1890600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1890700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1890800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1890900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1891000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1891100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1891200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1891300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1891400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1891500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1891600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1891700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1891800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1891900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1892000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1892100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1892200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1892300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1892400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1892500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1892600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1892700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1892800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1892900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1893000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1893100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1893200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1893300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 1893400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1893500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1893600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1893700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1893800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1893900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1894000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1894100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1894200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1894300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1894400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1894500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1894600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 1894700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1894800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1894900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1895000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1895100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 1895200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1895300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1895400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1895500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 1895600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1895700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1895800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1895900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1896000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1896100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 1896200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1896300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1896400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1896500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1896600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1896700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1896800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 1896900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1897000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 1897100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1897200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1897300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1897400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1897500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1897600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1897700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1897800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1897900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 1898000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1898100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 1898200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1898300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1898400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1898500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1898600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1898700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1898800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1898900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1899000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1899100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1899200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1899300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1899400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 1899500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1899600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1899700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1899800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1899900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1900000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0289764404296875, + "eval_runtime": 3119.3995, + "eval_samples_per_second": 360.558, + "eval_steps_per_second": 22.535, + "step": 1900000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1900100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1900200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1900300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1900400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1900500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1900600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1900700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1900800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1900900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1901000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1901100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1901200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1901300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1901400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1901500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1901600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 1901700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1901800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1901900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 1902000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1902100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1902200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1902300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1902400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1902500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1902600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1902700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1902800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1902900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1903000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 1903100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1903200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1903300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1903400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1903500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1903600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 1903700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1903800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1903900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1904000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1904100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1904200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0384, + "step": 1904300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 1904400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1904500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1904600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1904700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1904800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 1904900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0381, + "step": 1905000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1905100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1905200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 1905300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1905400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1905500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 1905600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1905700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1905800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1905900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1906000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1906100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1906200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1906300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1906400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1906500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1906600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1906700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1906800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1906900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1907000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1907100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1907200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1907300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1907400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1907500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1907600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1907700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1907800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1907900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1908000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1908100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1908200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 1908300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1908400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1908500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1908600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1908700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1908800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1908900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1909000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1909100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1909200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1909300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1909400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1909500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 1909600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1909700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1909800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1909900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1910000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1910100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1910200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 1910300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1910400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 1910500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1910600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1910700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1910800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1910900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1911000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1911100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1911200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1911300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1911400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 1911500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1911600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1911700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1911800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1911900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1912000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1912100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1912200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1912300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1912400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1912500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1912600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1912700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1912800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1912900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1913000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1913100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1913200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1913300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1913400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1913500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1913600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1913700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1913800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 1913900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1914000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 1914100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1914200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1914300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1914400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1914500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1914600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1914700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1914800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1914900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1915000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1915100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1915200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1915300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1915400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1915500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1915600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1915700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1915800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1915900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1916000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 1916100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1916200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 1916300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1916400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1916500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1916600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1916700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1916800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1916900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1917000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1917100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1917200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1917300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1917400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1917500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1917600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1917700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1917800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1917900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 1918000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1918100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1918200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1918300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1918400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1918500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 1918600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 1918700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 1918800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1918900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1919000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1919100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 1919200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1919300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 1919400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1919500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1919600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1919700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1919800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1919900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1920000 + }, + { + "epoch": 0.0, + "eval_loss": 0.028839111328125, + "eval_runtime": 3158.9184, + "eval_samples_per_second": 356.047, + "eval_steps_per_second": 22.253, + "step": 1920000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1920100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1920200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 1920300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1920400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1920500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1920600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1920700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1920800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1920900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1921000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1921100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1921200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1921300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1921400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1921500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1921600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1921700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1921800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1921900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 1922000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1922100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1922200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 1922300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1922400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1922500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1922600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1922700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1922800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1922900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1923000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1923100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1923200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1923300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1923400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1923500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1923600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 1923700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1923800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1923900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 1924000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1924100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1924200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1924300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1924400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1924500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1924600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1924700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1924800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1924900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1925000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 1925100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1925200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1925300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1925400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1925500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1925600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1925700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 1925800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1925900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1926000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1926100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1926200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1926300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1926400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1926500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1926600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1926700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1926800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1926900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1927000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1927100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1927200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1927300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1927400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1927500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1927600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1927700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 1927800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1927900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1928000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 1928100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1928200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1928300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1928400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1928500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 1928600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1928700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1928800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1928900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1929000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1929100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1929200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 1929300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1929400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1929500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1929600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1929700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1929800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 1929900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1930000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1930100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1930200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1930300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1930400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1930500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 1930600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1930700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1930800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1930900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1931000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1931100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1931200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1931300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1931400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 1931500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1931600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1931700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1931800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1931900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1932000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1932100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1932200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1932300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1932400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1932500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1932600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1932700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 1932800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 1932900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1933000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 1933100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1933200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 1933300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1933400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1933500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1933600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1933700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1933800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1933900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 1934000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1934100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 1934200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1934300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1934400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 1934500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 1934600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1934700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1934800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1934900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1935000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1935100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1935200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1935300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1935400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1935500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1935600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1935700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1935800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 1935900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1936000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1936100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 1936200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1936300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1936400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1936500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1936600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1936700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1936800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1936900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1937000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1937100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1937200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 1937300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1937400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1937500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1937600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1937700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1937800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1937900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1938000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1938100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1938200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1938300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0296, + "step": 1938400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1938500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1938600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1938700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1938800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1938900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1939000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1939100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1939200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1939300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 1939400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 1939500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1939600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1939700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 1939800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1939900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1940000 + }, + { + "epoch": 0.0, + "eval_loss": 0.028839111328125, + "eval_runtime": 3465.8168, + "eval_samples_per_second": 324.519, + "eval_steps_per_second": 20.283, + "step": 1940000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1940100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1940200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1940300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 1940400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1940500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1940600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1940700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1940800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 1940900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 1941000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0378, + "step": 1941100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1941200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1941300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1941400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1941500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1941600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1941700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1941800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1941900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1942000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1942100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1942200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1942300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1942400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1942500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 1942600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1942700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1942800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1942900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1943000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 1943100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1943200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1943300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1943400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1943500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1943600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1943700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1943800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1943900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1944000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 1944100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1944200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1944300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1944400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1944500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1944600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1944700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1944800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1944900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1945000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1945100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 1945200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 1945300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1945400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1945500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1945600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 1945700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1945800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1945900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1946000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 1946100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1946200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1946300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1946400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 1946500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1946600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1946700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1946800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1946900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1947000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1947100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1947200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1947300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1947400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1947500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1947600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1947700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1947800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1947900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1948000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1948100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1948200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1948300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1948400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1948500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1948600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1948700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1948800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1948900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1949000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1949100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1949200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1949300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1949400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1949500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1949600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1949700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1949800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1949900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1950000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1950100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1950200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1950300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1950400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1950500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 1950600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1950700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 1950800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1950900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1951000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1951100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1951200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1951300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1951400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1951500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 1951600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1951700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 1951800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 1951900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1952000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1952100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1952200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1952300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1952400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1952500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1952600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1952700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1952800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1952900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1953000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1953100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1953200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1953300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1953400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1953500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1953600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1953700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1953800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1953900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1954000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1954100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1954200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.029, + "step": 1954300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1954400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1954500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1954600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1954700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1954800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1954900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1955000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1955100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1955200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1955300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1955400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1955500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1955600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1955700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1955800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1955900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1956000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1956100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1956200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1956300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1956400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1956500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1956600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1956700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1956800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1956900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1957000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 1957100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1957200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1957300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1957400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1957500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1957600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1957700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 1957800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1957900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1958000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1958100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1958200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1958300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1958400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1958500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1958600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 1958700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 1958800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1958900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1959000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1959100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1959200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1959300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1959400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0296, + "step": 1959500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1959600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1959700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1959800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1959900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1960000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0286865234375, + "eval_runtime": 3421.7973, + "eval_samples_per_second": 328.694, + "eval_steps_per_second": 20.544, + "step": 1960000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1960100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 1960200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1960300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 1960400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1960500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1960600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1960700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1960800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1960900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1961000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1961100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1961200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1961300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1961400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1961500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1961600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1961700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1961800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1961900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1962000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1962100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1962200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1962300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 1962400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1962500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1962600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1962700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1962800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1962900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1963000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1963100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1963200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1963300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1963400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1963500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1963600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1963700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1963800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1963900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1964000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1964100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 1964200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 1964300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1964400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 1964500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1964600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1964700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1964800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1964900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1965000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1965100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1965200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1965300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1965400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1965500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1965600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1965700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 1965800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1965900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1966000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1966100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1966200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1966300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1966400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1966500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1966600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1966700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1966800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1966900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1967000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1967100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1967200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1967300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1967400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1967500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1967600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1967700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1967800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1967900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1968000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 1968100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0294, + "step": 1968200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1968300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 1968400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1968500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1968600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1968700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1968800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1968900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1969000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1969100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1969200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1969300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1969400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1969500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1969600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1969700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 1969800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1969900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1970000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1970100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1970200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1970300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1970400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1970500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1970600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1970700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1970800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1970900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1971000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1971100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1971200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1971300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1971400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1971500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1971600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 1971700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1971800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 1971900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1972000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1972100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1972200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1972300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1972400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1972500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 1972600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 1972700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1972800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1972900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1973000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1973100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1973200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 1973300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1973400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1973500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 1973600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1973700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1973800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 1973900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1974000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1974100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1974200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1974300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1974400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1974500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1974600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1974700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.029, + "step": 1974800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1974900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 1975000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1975100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1975200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1975300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 1975400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1975500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1975600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 1975700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1975800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1975900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1976000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1976100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1976200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1976300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1976400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 1976500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1976600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 1976700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1976800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 1976900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 1977000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 1977100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1977200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 1977300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1977400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1977500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1977600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1977700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1977800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 1977900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1978000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1978100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1978200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1978300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1978400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1978500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1978600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1978700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1978800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1978900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 1979000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1979100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1979200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1979300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1979400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1979500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1979600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1979700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1979800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 1979900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 1980000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0838623046875, + "eval_runtime": 3306.5915, + "eval_samples_per_second": 340.146, + "eval_steps_per_second": 21.259, + "step": 1980000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0407, + "step": 1980100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 1980200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1980300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 1980400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1980500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1980600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1980700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1980800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1980900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1981000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 1981100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1981200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1981300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1981400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1981500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1981600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1981700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 1981800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1981900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1982000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1982100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1982200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 1982300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1982400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 1982500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1982600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1982700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 1982800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1982900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 1983000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1983100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 1983200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1983300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1983400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1983500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1983600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1983700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1983800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1983900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1984000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1984100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1984200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1984300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1984400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1984500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1984600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1984700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1984800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1984900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 1985000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1985100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 1985200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1985300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1985400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1985500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1985600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1985700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1985800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1985900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1986000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1986100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 1986200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1986300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 1986400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1986500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1986600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1986700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1986800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1986900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1987000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1987100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1987200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1987300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1987400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1987500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1987600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1987700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1987800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1987900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1988000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 1988100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1988200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1988300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1988400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1988500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1988600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1988700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 1988800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1988900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1989000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1989100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1989200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1989300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1989400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 1989500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1989600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1989700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 1989800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1989900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1990000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1990100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1990200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1990300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 1990400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 1990500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1990600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1990700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1990800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1990900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1991000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 1991100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 1991200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1991300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1991400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1991500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1991600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1991700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1991800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1991900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1992000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1992100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1992200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1992300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 1992400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1992500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1992600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 1992700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1992800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1992900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1993000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1993100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1993200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 1993300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 1993400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 1993500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1993600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1993700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1993800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 1993900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 1994000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1994100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1994200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1994300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1994400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1994500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 1994600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1994700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1994800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 1994900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1995000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1995100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1995200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1995300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 1995400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1995500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1995600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1995700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 1995800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1995900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 1996000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1996100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1996200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1996300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1996400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1996500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1996600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 1996700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1996800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1996900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1997000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 1997100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 1997200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1997300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1997400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1997500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 1997600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 1997700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1997800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 1997900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1998000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 1998100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 1998200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 1998300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1998400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 1998500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 1998600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 1998700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 1998800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1998900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 1999000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 1999100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 1999200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 1999300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1999400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1999500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 1999600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 1999700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 1999800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 1999900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2000000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0287628173828125, + "eval_runtime": 3627.9486, + "eval_samples_per_second": 310.016, + "eval_steps_per_second": 19.376, + "step": 2000000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2000100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2000200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2000300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2000400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2000500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2000600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2000700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2000800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2000900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 2001000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 2001100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2001200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2001300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2001400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2001500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2001600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2001700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2001800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2001900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2002000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2002100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2002200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 2002300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2002400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2002500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2002600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2002700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2002800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 2002900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2003000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2003100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2003200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2003300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 2003400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2003500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2003600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2003700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2003800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2003900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2004000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2004100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2004200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2004300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2004400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2004500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 2004600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2004700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2004800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2004900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2005000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2005100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2005200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2005300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2005400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2005500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2005600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2005700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2005800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2005900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2006000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 2006100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2006200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2006300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2006400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2006500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2006600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2006700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2006800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2006900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2007000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2007100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2007200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2007300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 2007400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2007500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2007600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 2007700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2007800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2007900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2008000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2008100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 2008200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2008300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2008400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2008500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2008600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2008700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 2008800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2008900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2009000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 2009100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2009200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2009300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2009400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 2009500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 2009600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2009700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 2009800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2009900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2010000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 2010100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 2010200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2010300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2010400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2010500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2010600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 2010700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2010800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2010900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2011000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2011100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2011200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2011300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 2011400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2011500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2011600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2011700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2011800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 2011900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2012000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2012100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2012200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2012300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2012400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2012500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2012600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2012700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2012800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2012900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2013000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2013100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2013200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2013300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2013400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2013500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2013600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2013700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2013800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2013900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2014000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2014100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2014200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2014300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2014400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2014500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2014600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2014700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2014800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2014900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2015000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2015100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2015200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2015300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 2015400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2015500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2015600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2015700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 2015800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2015900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2016000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2016100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0376, + "step": 2016200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 2016300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 2016400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 2016500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 2016600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2016700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2016800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 2016900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 2017000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2017100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2017200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2017300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2017400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2017500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 2017600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2017700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2017800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 2017900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 2018000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 2018100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2018200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2018300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2018400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2018500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2018600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2018700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2018800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2018900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2019000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2019100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2019200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2019300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2019400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2019500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2019600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2019700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2019800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2019900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2020000 + }, + { + "epoch": 0.0, + "eval_loss": 0.02899169921875, + "eval_runtime": 3078.8528, + "eval_samples_per_second": 365.306, + "eval_steps_per_second": 22.832, + "step": 2020000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2020100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 2020200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2020300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2020400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2020500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2020600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2020700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 2020800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2020900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 2021000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 2021100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2021200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2021300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2021400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 2021500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2021600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 2021700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 2021800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2021900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2022000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2022100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2022200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2022300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2022400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2022500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 2022600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2022700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2022800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2022900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2023000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2023100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2023200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2023300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2023400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2023500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2023600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2023700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2023800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2023900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2024000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2024100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2024200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2024300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2024400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2024500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2024600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2024700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2024800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2024900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2025000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2025100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2025200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2025300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2025400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2025500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2025600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2025700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2025800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2025900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2026000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2026100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2026200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2026300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2026400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2026500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2026600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2026700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2026800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2026900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2027000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2027100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2027200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2027300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2027400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2027500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2027600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2027700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2027800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2027900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2028000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2028100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2028200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2028300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2028400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2028500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2028600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2028700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2028800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 2028900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2029000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 2029100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 2029200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2029300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2029400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2029500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2029600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2029700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2029800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2029900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2030000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2030100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2030200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2030300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2030400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2030500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2030600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2030700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0296, + "step": 2030800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2030900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2031000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2031100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2031200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2031300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 2031400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2031500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2031600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2031700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 2031800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 2031900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2032000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2032100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2032200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2032300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2032400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 2032500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2032600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2032700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2032800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2032900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 2033000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2033100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2033200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2033300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2033400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0296, + "step": 2033500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2033600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2033700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2033800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2033900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2034000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2034100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2034200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2034300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2034400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2034500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2034600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2034700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2034800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2034900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 2035000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 2035100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2035200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2035300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 2035400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2035500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2035600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2035700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 2035800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2035900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2036000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2036100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2036200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2036300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2036400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2036500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2036600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 2036700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2036800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2036900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2037000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2037100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2037200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2037300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 2037400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 2037500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2037600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2037700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2037800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2037900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2038000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2038100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2038200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2038300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2038400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2038500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2038600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2038700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 2038800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 2038900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2039000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2039100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2039200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2039300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2039400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2039500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2039600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2039700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2039800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2039900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2040000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0289764404296875, + "eval_runtime": 3149.7392, + "eval_samples_per_second": 357.084, + "eval_steps_per_second": 22.318, + "step": 2040000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2040100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2040200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2040300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2040400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2040500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2040600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2040700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2040800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 2040900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2041000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2041100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 2041200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2041300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2041400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2041500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2041600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2041700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2041800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2041900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2042000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 2042100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 2042200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 2042300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2042400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2042500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2042600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2042700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2042800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 2042900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2043000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2043100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2043200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2043300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 2043400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2043500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2043600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2043700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2043800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 2043900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2044000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2044100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0292, + "step": 2044200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2044300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2044400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 2044500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2044600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 2044700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2044800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 2044900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2045000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2045100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2045200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2045300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2045400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 2045500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2045600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2045700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2045800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2045900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2046000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2046100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2046200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2046300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2046400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 2046500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2046600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2046700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2046800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2046900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0294, + "step": 2047000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2047100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 2047200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2047300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2047400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2047500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2047600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2047700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2047800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2047900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2048000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2048100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2048200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2048300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2048400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2048500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2048600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2048700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2048800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2048900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2049000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2049100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2049200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2049300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2049400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2049500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2049600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2049700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 2049800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2049900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2050000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2050100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2050200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2050300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2050400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2050500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2050600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2050700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2050800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2050900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2051000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2051100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2051200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2051300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2051400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2051500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2051600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2051700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2051800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2051900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2052000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2052100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2052200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2052300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2052400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2052500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2052600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2052700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 2052800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2052900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2053000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2053100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2053200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2053300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2053400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2053500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2053600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2053700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2053800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2053900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2054000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2054100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2054200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 2054300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2054400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2054500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2054600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2054700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2054800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2054900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2055000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 2055100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 2055200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2055300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2055400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2055500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2055600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2055700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2055800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2055900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2056000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2056100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2056200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2056300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2056400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2056500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2056600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 2056700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2056800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2056900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2057000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2057100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2057200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2057300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2057400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 2057500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2057600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2057700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2057800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2057900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2058000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2058100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2058200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2058300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2058400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2058500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2058600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2058700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2058800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2058900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2059000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2059100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2059200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 2059300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2059400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2059500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2059600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2059700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2059800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2059900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2060000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0288543701171875, + "eval_runtime": 3840.4216, + "eval_samples_per_second": 292.864, + "eval_steps_per_second": 18.304, + "step": 2060000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2060100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2060200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2060300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2060400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2060500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2060600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2060700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2060800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 2060900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2061000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2061100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2061200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2061300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2061400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2061500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2061600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2061700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 2061800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 2061900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2062000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2062100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2062200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2062300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2062400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2062500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2062600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 2062700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2062800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 2062900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2063000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 2063100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2063200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2063300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2063400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2063500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2063600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2063700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2063800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2063900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2064000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2064100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2064200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2064300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2064400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2064500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2064600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2064700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2064800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0507, + "step": 2064900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2065000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2065100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2065200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2065300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2065400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2065500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 2065600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2065700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2065800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2065900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2066000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2066100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2066200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 2066300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2066400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2066500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2066600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2066700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 2066800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2066900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 2067000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2067100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2067200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2067300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2067400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2067500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2067600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 2067700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2067800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2067900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2068000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2068100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2068200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2068300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2068400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2068500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2068600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2068700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2068800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2068900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2069000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2069100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2069200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2069300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 2069400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2069500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2069600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2069700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2069800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2069900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2070000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2070100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2070200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2070300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2070400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2070500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2070600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2070700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 2070800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2070900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2071000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2071100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2071200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2071300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2071400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2071500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2071600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2071700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2071800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2071900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2072000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2072100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2072200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2072300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2072400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2072500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2072600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2072700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 2072800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 2072900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2073000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2073100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2073200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2073300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2073400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2073500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2073600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2073700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2073800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2073900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2074000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2074100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2074200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2074300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2074400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 2074500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2074600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2074700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2074800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2074900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2075000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2075100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2075200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2075300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2075400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2075500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2075600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2075700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2075800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2075900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2076000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2076100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2076200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2076300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2076400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 2076500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2076600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2076700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2076800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2076900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2077000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2077100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2077200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2077300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 2077400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 2077500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2077600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2077700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2077800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2077900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2078000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2078100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2078200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2078300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2078400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2078500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 2078600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2078700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2078800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2078900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2079000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2079100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 2079200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2079300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2079400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2079500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2079600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2079700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2079800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2079900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2080000 + }, + { + "epoch": 0.0, + "eval_loss": 0.028472900390625, + "eval_runtime": 3134.6894, + "eval_samples_per_second": 358.799, + "eval_steps_per_second": 22.425, + "step": 2080000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2080100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2080200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2080300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2080400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2080500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2080600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2080700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2080800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2080900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2081000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2081100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2081200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 2081300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2081400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2081500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 2081600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2081700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2081800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 2081900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2082000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2082100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2082200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2082300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2082400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2082500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2082600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2082700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2082800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2082900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2083000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2083100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2083200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2083300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2083400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2083500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2083600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2083700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2083800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2083900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2084000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2084100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2084200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2084300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2084400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2084500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2084600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2084700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 2084800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2084900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2085000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2085100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2085200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2085300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2085400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2085500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2085600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2085700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2085800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2085900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2086000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2086100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2086200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2086300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2086400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2086500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2086600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 2086700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2086800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2086900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2087000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2087100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2087200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2087300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2087400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2087500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 2087600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2087700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2087800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2087900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2088000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2088100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2088200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2088300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2088400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2088500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2088600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2088700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2088800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2088900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 2089000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 2089100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 2089200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2089300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2089400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2089500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2089600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2089700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2089800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2089900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2090000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2090100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2090200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 2090300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2090400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2090500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2090600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2090700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2090800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2090900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2091000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2091100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2091200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2091300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2091400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2091500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2091600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 2091700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2091800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2091900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2092000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2092100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2092200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2092300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2092400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2092500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2092600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 2092700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2092800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2092900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 2093000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 2093100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2093200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2093300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2093400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2093500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2093600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2093700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2093800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2093900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2094000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2094100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2094200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2094300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2094400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2094500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 2094600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 2094700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2094800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2094900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2095000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2095100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2095200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2095300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0439, + "step": 2095400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 2095500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 2095600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2095700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2095800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2095900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2096000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2096100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2096200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2096300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2096400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2096500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 2096600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2096700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2096800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2096900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2097000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2097100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2097200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2097300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2097400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2097500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2097600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2097700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2097800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2097900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2098000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2098100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2098200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2098300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2098400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2098500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2098600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2098700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2098800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2098900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2099000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2099100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2099200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2099300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2099400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2099500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2099600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2099700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2099800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2099900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2100000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0286865234375, + "eval_runtime": 3215.2333, + "eval_samples_per_second": 349.811, + "eval_steps_per_second": 21.863, + "step": 2100000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2100100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2100200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2100300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2100400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2100500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2100600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2100700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2100800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2100900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2101000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2101100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2101200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 2101300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2101400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0296, + "step": 2101500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2101600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2101700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2101800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2101900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2102000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2102100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2102200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2102300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2102400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2102500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2102600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2102700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2102800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2102900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2103000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2103100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2103200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2103300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2103400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2103500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2103600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2103700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2103800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2103900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2104000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2104100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2104200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2104300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2104400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2104500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2104600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2104700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2104800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2104900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2105000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 2105100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 2105200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2105300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2105400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2105500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2105600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2105700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2105800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2105900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2106000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2106100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2106200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 2106300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2106400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2106500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2106600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2106700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2106800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2106900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2107000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2107100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2107200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2107300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2107400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2107500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2107600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2107700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2107800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2107900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2108000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2108100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2108200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2108300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2108400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2108500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2108600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2108700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2108800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2108900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2109000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2109100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2109200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2109300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2109400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2109500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2109600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2109700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2109800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2109900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2110000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2110100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2110200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2110300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2110400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2110500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2110600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2110700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2110800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2110900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2111000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2111100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2111200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2111300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2111400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2111500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2111600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2111700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2111800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2111900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 2112000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 2112100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2112200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2112300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2112400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2112500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2112600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2112700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2112800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2112900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2113000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2113100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2113200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2113300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2113400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2113500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2113600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2113700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2113800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2113900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2114000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2114100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 2114200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2114300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2114400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2114500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2114600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2114700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2114800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2114900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2115000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2115100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2115200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2115300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2115400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2115500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2115600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 2115700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2115800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2115900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2116000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2116100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2116200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2116300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2116400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2116500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2116600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2116700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2116800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2116900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2117000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2117100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2117200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2117300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2117400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2117500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2117600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0286, + "step": 2117700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2117800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 2117900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2118000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2118100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2118200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2118300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2118400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2118500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2118600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2118700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2118800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2118900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2119000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2119100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2119200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2119300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2119400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2119500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2119600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2119700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2119800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 2119900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2120000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0284881591796875, + "eval_runtime": 3185.3885, + "eval_samples_per_second": 353.088, + "eval_steps_per_second": 22.068, + "step": 2120000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2120100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2120200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2120300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2120400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2120500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2120600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2120700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2120800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2120900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2121000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2121100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2121200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2121300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 2121400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2121500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2121600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2121700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2121800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2121900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2122000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 2122100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2122200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2122300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2122400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2122500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2122600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2122700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2122800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2122900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2123000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2123100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 2123200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2123300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2123400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2123500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2123600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2123700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2123800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2123900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2124000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2124100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2124200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2124300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 2124400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2124500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2124600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 2124700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2124800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2124900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2125000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2125100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2125200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2125300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2125400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2125500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2125600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2125700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2125800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2125900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2126000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2126100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2126200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2126300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2126400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2126500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2126600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2126700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2126800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2126900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2127000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2127100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2127200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2127300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2127400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2127500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2127600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2127700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2127800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 2127900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2128000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2128100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2128200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2128300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2128400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2128500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2128600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 2128700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2128800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2128900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2129000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2129100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0296, + "step": 2129200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 2129300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2129400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2129500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2129600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 2129700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2129800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 2129900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2130000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0293, + "step": 2130100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2130200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2130300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 2130400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2130500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2130600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2130700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2130800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2130900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2131000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2131100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2131200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2131300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2131400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2131500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2131600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2131700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2131800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 2131900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2132000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2132100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2132200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2132300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2132400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2132500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2132600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2132700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2132800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2132900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2133000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2133100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2133200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2133300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2133400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 2133500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 2133600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2133700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2133800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2133900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2134000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2134100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 2134200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2134300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2134400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2134500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2134600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2134700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2134800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 2134900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2135000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2135100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2135200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2135300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2135400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2135500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2135600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 2135700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2135800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0291, + "step": 2135900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2136000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2136100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2136200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2136300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2136400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2136500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2136600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2136700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2136800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2136900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2137000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2137100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2137200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2137300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2137400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 2137500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 2137600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 2137700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2137800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 2137900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2138000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2138100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2138200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2138300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2138400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2138500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2138600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2138700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2138800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2138900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2139000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2139100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2139200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2139300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2139400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2139500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2139600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 2139700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2139800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2139900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2140000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0284576416015625, + "eval_runtime": 3176.7209, + "eval_samples_per_second": 354.052, + "eval_steps_per_second": 22.128, + "step": 2140000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2140100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2140200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2140300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2140400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2140500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2140600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2140700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2140800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2140900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2141000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2141100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2141200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2141300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2141400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2141500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 2141600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2141700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2141800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2141900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2142000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2142100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2142200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2142300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2142400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2142500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2142600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2142700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2142800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 2142900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2143000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 2143100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2143200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2143300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2143400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2143500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2143600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 2143700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 2143800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2143900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2144000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2144100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2144200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2144300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2144400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2144500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2144600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2144700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2144800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2144900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2145000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2145100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2145200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2145300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2145400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2145500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2145600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2145700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2145800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2145900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2146000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2146100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2146200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2146300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2146400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2146500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2146600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2146700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2146800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2146900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2147000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2147100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2147200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2147300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 2147400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2147500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2147600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2147700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2147800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 2147900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 2148000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2148100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2148200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2148300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2148400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2148500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2148600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2148700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2148800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2148900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2149000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2149100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2149200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2149300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2149400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2149500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2149600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2149700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2149800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2149900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2150000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2150100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2150200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2150300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2150400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2150500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2150600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2150700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 2150800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2150900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2151000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2151100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2151200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2151300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2151400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2151500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2151600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2151700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 2151800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2151900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2152000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 2152100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2152200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2152300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2152400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 2152500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2152600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 2152700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2152800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2152900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2153000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2153100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2153200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2153300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2153400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2153500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2153600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2153700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2153800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2153900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2154000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2154100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2154200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2154300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2154400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2154500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2154600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2154700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2154800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2154900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2155000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2155100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 2155200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2155300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2155400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2155500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2155600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2155700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2155800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2155900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2156000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2156100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2156200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2156300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2156400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2156500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2156600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2156700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2156800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2156900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 2157000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2157100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2157200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2157300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2157400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2157500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2157600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2157700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 2157800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2157900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2158000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2158100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2158200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2158300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2158400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2158500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2158600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2158700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2158800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2158900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2159000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2159100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2159200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2159300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2159400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2159500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2159600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2159700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2159800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2159900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2160000 + }, + { + "epoch": 0.0, + "eval_loss": 0.028472900390625, + "eval_runtime": 3202.5066, + "eval_samples_per_second": 351.201, + "eval_steps_per_second": 21.95, + "step": 2160000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2160100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2160200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2160300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2160400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2160500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2160600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2160700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2160800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2160900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2161000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2161100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2161200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2161300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2161400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2161500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2161600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2161700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2161800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2161900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2162000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 2162100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 2162200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0363, + "step": 2162300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 2162400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2162500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 2162600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 2162700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2162800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2162900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2163000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2163100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2163200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 2163300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 2163400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2163500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0418, + "step": 2163600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2163700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2163800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 2163900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2164000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2164100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2164200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2164300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2164400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2164500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2164600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2164700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 2164800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 2164900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2165000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 2165100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0413, + "step": 2165200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 2165300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2165400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 2165500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 2165600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2165700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 2165800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2165900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2166000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2166100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.038, + "step": 2166200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 2166300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2166400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2166500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2166600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 2166700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2166800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2166900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 2167000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2167100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2167200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2167300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2167400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2167500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2167600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 2167700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2167800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 2167900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2168000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2168100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2168200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2168300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2168400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2168500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 2168600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2168700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2168800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2168900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2169000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2169100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2169200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2169300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2169400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2169500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2169600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2169700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2169800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2169900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2170000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2170100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2170200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2170300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2170400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2170500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2170600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 2170700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0379, + "step": 2170800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 2170900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2171000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2171100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2171200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2171300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 2171400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2171500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2171600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2171700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2171800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 2171900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2172000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2172100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2172200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2172300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2172400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2172500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2172600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 2172700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 2172800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2172900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 2173000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2173100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 2173200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2173300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2173400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2173500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 2173600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2173700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2173800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2173900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 2174000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2174100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2174200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2174300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2174400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2174500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2174600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2174700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 2174800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2174900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2175000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2175100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2175200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2175300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2175400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2175500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2175600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2175700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2175800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 2175900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2176000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2176100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2176200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 2176300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2176400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2176500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2176600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2176700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2176800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2176900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 2177000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2177100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2177200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2177300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2177400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2177500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2177600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2177700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2177800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2177900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2178000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2178100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2178200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2178300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2178400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 2178500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2178600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2178700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2178800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2178900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2179000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2179100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2179200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2179300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2179400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2179500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2179600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2179700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2179800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2179900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 2180000 + }, + { + "epoch": 0.0, + "eval_loss": 0.028778076171875, + "eval_runtime": 3226.9277, + "eval_samples_per_second": 348.543, + "eval_steps_per_second": 21.784, + "step": 2180000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2180100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2180200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2180300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2180400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2180500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2180600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2180700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2180800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2180900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2181000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2181100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2181200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2181300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2181400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2181500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2181600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2181700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2181800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 2181900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2182000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2182100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2182200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2182300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2182400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 2182500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2182600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 2182700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 2182800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2182900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2183000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2183100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2183200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 2183300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 2183400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 2183500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 2183600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 2183700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 2183800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2183900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2184000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2184100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 2184200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2184300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2184400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 2184500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 2184600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2184700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2184800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2184900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2185000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2185100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2185200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2185300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2185400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2185500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2185600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2185700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2185800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2185900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2186000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2186100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2186200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 2186300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 2186400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2186500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2186600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2186700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2186800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2186900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2187000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2187100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 2187200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2187300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2187400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2187500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2187600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2187700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2187800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2187900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2188000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2188100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2188200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2188300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2188400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2188500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2188600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2188700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2188800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 2188900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2189000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2189100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 2189200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2189300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2189400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2189500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 2189600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2189700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2189800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 2189900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2190000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 2190100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2190200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2190300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2190400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2190500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2190600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2190700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2190800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2190900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2191000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2191100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2191200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2191300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0519, + "step": 2191400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0475, + "step": 2191500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2191600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2191700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2191800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2191900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2192000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2192100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2192200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 2192300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 2192400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0358, + "step": 2192500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2192600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2192700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2192800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2192900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2193000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2193100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 2193200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2193300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2193400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2193500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2193600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 2193700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2193800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2193900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2194000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 2194100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0379, + "step": 2194200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 2194300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 2194400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2194500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2194600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2194700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2194800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2194900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2195000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2195100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2195200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2195300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2195400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2195500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2195600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2195700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2195800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2195900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2196000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2196100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2196200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2196300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2196400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2196500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2196600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2196700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2196800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2196900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2197000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2197100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2197200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2197300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2197400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2197500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2197600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 2197700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2197800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2197900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2198000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 2198100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 2198200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 2198300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2198400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 2198500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2198600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2198700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2198800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2198900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2199000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2199100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2199200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2199300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2199400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2199500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 2199600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2199700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2199800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2199900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2200000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0284271240234375, + "eval_runtime": 2908.5149, + "eval_samples_per_second": 386.7, + "eval_steps_per_second": 24.169, + "step": 2200000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2200100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2200200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2200300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2200400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2200500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2200600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 2200700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2200800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2200900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2201000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2201100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2201200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 2201300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0369, + "step": 2201400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2201500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2201600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2201700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2201800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2201900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2202000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 2202100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2202200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2202300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2202400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2202500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2202600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2202700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2202800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.029, + "step": 2202900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2203000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2203100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2203200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2203300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2203400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2203500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2203600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2203700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2203800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2203900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2204000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2204100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2204200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2204300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2204400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2204500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2204600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2204700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2204800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2204900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2205000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2205100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2205200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2205300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2205400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2205500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2205600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2205700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2205800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2205900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2206000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2206100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2206200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 2206300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2206400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 2206500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2206600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2206700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 2206800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2206900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2207000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2207100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 2207200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2207300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2207400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2207500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2207600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2207700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2207800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2207900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2208000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2208100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2208200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2208300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2208400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2208500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2208600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2208700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2208800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2208900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2209000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2209100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2209200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2209300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2209400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2209500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2209600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2209700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2209800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2209900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2210000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2210100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2210200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 2210300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2210400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2210500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2210600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2210700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2210800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2210900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2211000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2211100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2211200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2211300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0296, + "step": 2211400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2211500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2211600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2211700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2211800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2211900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2212000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 2212100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2212200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2212300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2212400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 2212500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2212600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2212700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2212800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2212900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2213000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2213100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2213200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2213300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2213400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2213500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2213600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2213700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2213800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2213900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 2214000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2214100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2214200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 2214300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2214400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2214500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2214600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2214700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2214800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2214900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 2215000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2215100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2215200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2215300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2215400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0296, + "step": 2215500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2215600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2215700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2215800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2215900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2216000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2216100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2216200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2216300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 2216400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2216500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 2216600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2216700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2216800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2216900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2217000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2217100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2217200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2217300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2217400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2217500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2217600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2217700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2217800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 2217900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2218000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2218100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2218200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2218300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2218400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2218500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2218600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2218700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2218800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2218900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2219000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2219100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2219200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2219300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2219400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2219500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2219600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2219700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2219800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2219900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2220000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0283355712890625, + "eval_runtime": 3375.4901, + "eval_samples_per_second": 333.203, + "eval_steps_per_second": 20.825, + "step": 2220000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2220100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2220200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2220300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2220400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0294, + "step": 2220500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2220600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2220700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2220800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2220900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2221000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2221100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2221200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 2221300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2221400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0292, + "step": 2221500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2221600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2221700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2221800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2221900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2222000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 2222100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2222200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2222300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2222400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2222500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2222600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2222700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2222800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2222900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2223000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2223100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2223200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2223300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2223400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2223500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 2223600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2223700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2223800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2223900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2224000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2224100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2224200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2224300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2224400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2224500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2224600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2224700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0292, + "step": 2224800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2224900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2225000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2225100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2225200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2225300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2225400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2225500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2225600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2225700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2225800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2225900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2226000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2226100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2226200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2226300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2226400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2226500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2226600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2226700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2226800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2226900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2227000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2227100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2227200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2227300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2227400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2227500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2227600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2227700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2227800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2227900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2228000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2228100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2228200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2228300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2228400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2228500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2228600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2228700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 2228800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2228900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2229000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2229100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2229200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2229300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2229400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2229500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2229600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2229700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2229800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 2229900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2230000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2230100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2230200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2230300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2230400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2230500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2230600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2230700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2230800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2230900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2231000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2231100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2231200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2231300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2231400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2231500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2231600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2231700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2231800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0294, + "step": 2231900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2232000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2232100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2232200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2232300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2232400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2232500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2232600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2232700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2232800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2232900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2233000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2233100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2233200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2233300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2233400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2233500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2233600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2233700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2233800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2233900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2234000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2234100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2234200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2234300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 2234400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2234500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2234600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2234700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2234800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2234900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 2235000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2235100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2235200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2235300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2235400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2235500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2235600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2235700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2235800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2235900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2236000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2236100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2236200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2236300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2236400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2236500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0291, + "step": 2236600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2236700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2236800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2236900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2237000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2237100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2237200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2237300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2237400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2237500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2237600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2237700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2237800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2237900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 2238000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2238100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2238200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2238300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2238400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2238500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 2238600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2238700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2238800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2238900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2239000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2239100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2239200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2239300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 2239400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 2239500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2239600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2239700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 2239800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2239900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2240000 + }, + { + "epoch": 0.0, + "eval_loss": 0.02838134765625, + "eval_runtime": 3108.7086, + "eval_samples_per_second": 361.797, + "eval_steps_per_second": 22.613, + "step": 2240000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2240100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2240200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2240300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2240400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2240500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2240600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2240700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2240800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2240900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 2241000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2241100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2241200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2241300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2241400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2241500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 2241600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2241700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2241800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2241900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2242000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2242100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0294, + "step": 2242200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2242300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 2242400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2242500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0755, + "step": 2242600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 2242700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2242800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2242900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2243000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2243100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2243200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2243300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2243400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2243500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2243600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2243700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2243800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2243900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2244000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2244100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2244200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2244300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2244400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0296, + "step": 2244500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2244600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2244700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2244800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0573, + "step": 2244900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 2245000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2245100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 2245200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2245300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2245400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2245500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 2245600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2245700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2245800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2245900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2246000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2246100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2246200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2246300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2246400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2246500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2246600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2246700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2246800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2246900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2247000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2247100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 2247200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2247300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2247400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2247500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2247600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2247700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2247800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2247900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2248000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2248100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2248200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2248300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2248400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2248500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2248600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2248700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2248800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2248900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2249000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2249100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2249200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2249300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2249400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2249500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2249600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2249700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2249800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2249900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2250000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2250100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2250200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2250300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2250400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2250500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2250600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2250700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 2250800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2250900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2251000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 2251100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2251200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2251300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2251400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2251500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 2251600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2251700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2251800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2251900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2252000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2252100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2252200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2252300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2252400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2252500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2252600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2252700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2252800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2252900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0292, + "step": 2253000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2253100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2253200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2253300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2253400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 2253500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2253600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2253700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2253800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2253900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2254000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2254100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2254200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2254300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2254400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2254500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2254600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2254700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2254800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2254900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2255000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2255100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2255200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2255300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2255400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2255500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2255600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2255700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2255800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2255900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 2256000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2256100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2256200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2256300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2256400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 2256500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2256600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2256700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2256800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2256900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2257000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2257100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2257200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2257300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 2257400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2257500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2257600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2257700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2257800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2257900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2258000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2258100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2258200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2258300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2258400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2258500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2258600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2258700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 2258800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2258900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2259000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2259100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2259200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2259300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2259400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2259500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2259600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2259700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2259800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 2259900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2260000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0285491943359375, + "eval_runtime": 3338.9852, + "eval_samples_per_second": 336.846, + "eval_steps_per_second": 21.053, + "step": 2260000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 2260100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2260200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 2260300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2260400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2260500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2260600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 2260700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2260800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2260900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2261000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2261100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2261200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2261300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2261400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2261500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2261600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2261700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2261800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2261900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2262000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2262100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2262200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2262300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2262400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2262500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2262600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2262700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2262800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2262900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2263000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2263100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2263200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2263300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2263400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 2263500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2263600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2263700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2263800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2263900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2264000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2264100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 2264200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2264300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2264400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2264500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2264600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2264700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2264800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2264900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2265000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2265100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2265200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2265300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2265400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2265500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2265600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2265700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2265800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2265900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2266000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2266100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2266200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2266300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2266400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2266500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2266600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2266700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2266800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2266900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2267000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2267100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2267200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2267300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2267400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2267500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2267600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2267700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2267800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2267900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2268000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2268100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2268200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2268300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.029, + "step": 2268400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2268500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2268600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2268700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2268800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2268900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2269000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2269100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2269200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2269300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2269400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 2269500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2269600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2269700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2269800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2269900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2270000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2270100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2270200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2270300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2270400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2270500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2270600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2270700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2270800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2270900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2271000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2271100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2271200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2271300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2271400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2271500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 2271600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2271700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2271800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2271900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2272000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2272100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2272200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2272300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2272400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2272500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2272600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2272700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2272800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2272900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2273000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2273100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2273200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2273300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2273400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2273500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2273600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2273700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2273800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2273900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2274000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0294, + "step": 2274100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2274200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0383, + "step": 2274300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0391, + "step": 2274400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2274500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2274600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2274700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2274800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2274900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 2275000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 2275100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2275200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 2275300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2275400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2275500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2275600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2275700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2275800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0292, + "step": 2275900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2276000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2276100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2276200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2276300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2276400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0294, + "step": 2276500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2276600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2276700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2276800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2276900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0386, + "step": 2277000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 2277100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2277200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2277300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2277400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2277500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2277600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2277700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 2277800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2277900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2278000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2278100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2278200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2278300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2278400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2278500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 2278600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2278700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2278800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2278900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2279000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2279100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 2279200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2279300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2279400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2279500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2279600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2279700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2279800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2279900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2280000 + }, + { + "epoch": 0.0, + "eval_loss": 0.028411865234375, + "eval_runtime": 3129.8443, + "eval_samples_per_second": 359.354, + "eval_steps_per_second": 22.46, + "step": 2280000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2280100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2280200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2280300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2280400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2280500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2280600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2280700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2280800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2280900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2281000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2281100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2281200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2281300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2281400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2281500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2281600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2281700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2281800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2281900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2282000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2282100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 2282200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2282300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2282400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2282500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 2282600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2282700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2282800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2282900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2283000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2283100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2283200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2283300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2283400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2283500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2283600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2283700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2283800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2283900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2284000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2284100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2284200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2284300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2284400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2284500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0294, + "step": 2284600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2284700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2284800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2284900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2285000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2285100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2285200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 2285300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2285400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2285500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2285600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2285700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2285800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2285900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2286000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2286100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2286200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0294, + "step": 2286300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2286400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2286500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2286600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2286700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2286800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2286900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2287000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0292, + "step": 2287100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2287200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0287, + "step": 2287300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2287400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2287500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2287600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2287700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2287800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2287900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.038, + "step": 2288000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2288100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2288200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2288300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 2288400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2288500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2288600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2288700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2288800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2288900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2289000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2289100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2289200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2289300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2289400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2289500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2289600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2289700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2289800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 2289900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2290000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2290100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2290200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2290300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2290400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2290500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2290600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2290700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2290800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2290900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2291000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2291100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2291200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2291300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2291400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 2291500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2291600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2291700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2291800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2291900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2292000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 2292100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2292200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2292300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2292400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2292500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2292600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2292700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2292800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 2292900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2293000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2293100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2293200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2293300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2293400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2293500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 2293600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2293700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2293800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2293900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2294000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 2294100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2294200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2294300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2294400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2294500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2294600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 2294700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2294800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2294900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2295000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2295100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2295200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2295300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2295400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2295500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2295600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2295700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2295800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2295900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2296000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2296100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0292, + "step": 2296200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2296300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2296400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2296500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2296600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2296700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2296800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2296900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2297000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2297100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2297200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2297300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2297400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2297500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2297600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2297700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2297800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2297900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2298000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2298100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2298200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2298300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2298400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2298500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2298600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 2298700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2298800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2298900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2299000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2299100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2299200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2299300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2299400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2299500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2299600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2299700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2299800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2299900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2300000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0286712646484375, + "eval_runtime": 3485.3801, + "eval_samples_per_second": 322.697, + "eval_steps_per_second": 20.169, + "step": 2300000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2300100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2300200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2300300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2300400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2300500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2300600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2300700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2300800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2300900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2301000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2301100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2301200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2301300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2301400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2301500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2301600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 2301700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2301800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2301900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2302000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2302100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2302200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 2302300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2302400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2302500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2302600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2302700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2302800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2302900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2303000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 2303100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2303200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2303300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2303400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2303500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2303600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2303700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2303800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2303900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2304000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2304100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 2304200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2304300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2304400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2304500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2304600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2304700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0296, + "step": 2304800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2304900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2305000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2305100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2305200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2305300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2305400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2305500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2305600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2305700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2305800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2305900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2306000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2306100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2306200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2306300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2306400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2306500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2306600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2306700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2306800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2306900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2307000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2307100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0444, + "step": 2307200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2307300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2307400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 2307500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2307600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2307700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2307800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2307900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2308000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2308100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2308200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2308300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2308400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2308500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2308600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2308700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2308800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2308900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2309000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2309100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2309200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0291, + "step": 2309300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2309400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0291, + "step": 2309500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2309600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2309700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2309800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2309900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2310000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2310100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2310200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2310300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 2310400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2310500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2310600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2310700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2310800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2310900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2311000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2311100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2311200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2311300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2311400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2311500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2311600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2311700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2311800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 2311900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2312000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 2312100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 2312200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2312300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2312400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2312500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2312600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2312700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2312800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2312900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2313000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2313100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2313200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2313300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2313400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2313500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2313600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2313700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2313800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2313900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2314000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2314100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2314200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2314300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2314400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2314500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2314600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2314700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2314800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0296, + "step": 2314900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2315000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2315100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2315200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2315300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2315400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 2315500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2315600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2315700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2315800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2315900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2316000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2316100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2316200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2316300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2316400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2316500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2316600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2316700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2316800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2316900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2317000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2317100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2317200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2317300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2317400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2317500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2317600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2317700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2317800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2317900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2318000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2318100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2318200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2318300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2318400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2318500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2318600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2318700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2318800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2318900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2319000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2319100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2319200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2319300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 2319400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 2319500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2319600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2319700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2319800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2319900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2320000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0284271240234375, + "eval_runtime": 3334.3289, + "eval_samples_per_second": 337.316, + "eval_steps_per_second": 21.083, + "step": 2320000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2320100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2320200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2320300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2320400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2320500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2320600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 2320700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2320800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2320900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2321000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2321100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2321200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2321300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2321400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2321500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2321600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2321700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0292, + "step": 2321800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2321900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2322000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2322100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2322200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 2322300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2322400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2322500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2322600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2322700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2322800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2322900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 2323000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2323100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2323200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2323300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 2323400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0294, + "step": 2323500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2323600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2323700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 2323800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2323900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2324000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2324100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 2324200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2324300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2324400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2324500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2324600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 2324700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2324800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2324900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2325000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2325100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2325200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2325300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2325400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2325500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2325600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2325700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2325800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2325900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 2326000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 2326100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 2326200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2326300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2326400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2326500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2326600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2326700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 2326800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2326900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2327000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2327100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2327200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 2327300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2327400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2327500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2327600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 2327700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2327800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2327900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2328000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2328100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2328200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2328300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2328400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 2328500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2328600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2328700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2328800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2328900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 2329000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2329100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2329200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2329300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2329400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2329500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2329600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2329700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2329800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2329900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2330000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2330100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2330200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 2330300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2330400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2330500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.029, + "step": 2330600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2330700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2330800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2330900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2331000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2331100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2331200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2331300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2331400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2331500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2331600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2331700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0293, + "step": 2331800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2331900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2332000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2332100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2332200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2332300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2332400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2332500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2332600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2332700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2332800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2332900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2333000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2333100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2333200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2333300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2333400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2333500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2333600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2333700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2333800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2333900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2334000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2334100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2334200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2334300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2334400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2334500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 2334600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2334700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2334800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0293, + "step": 2334900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2335000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2335100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2335200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2335300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2335400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2335500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2335600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2335700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2335800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2335900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2336000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 2336100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2336200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2336300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2336400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2336500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 2336600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2336700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2336800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2336900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2337000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0293, + "step": 2337100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2337200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2337300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2337400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2337500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0286, + "step": 2337600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2337700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2337800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2337900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2338000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2338100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2338200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2338300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2338400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2338500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2338600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2338700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2338800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2338900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2339000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2339100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2339200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2339300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2339400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2339500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2339600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2339700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2339800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2339900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 2340000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0283966064453125, + "eval_runtime": 3291.434, + "eval_samples_per_second": 341.712, + "eval_steps_per_second": 21.357, + "step": 2340000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2340100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2340200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2340300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 2340400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2340500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2340600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2340700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2340800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2340900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2341000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2341100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2341200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2341300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2341400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2341500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2341600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2341700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2341800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2341900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2342000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 2342100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2342200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2342300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2342400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2342500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2342600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 2342700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2342800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2342900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2343000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2343100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2343200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2343300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2343400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2343500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2343600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2343700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2343800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2343900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0365, + "step": 2344000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2344100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2344200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2344300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2344400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2344500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2344600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2344700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2344800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2344900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2345000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2345100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2345200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2345300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2345400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2345500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2345600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 2345700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2345800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2345900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2346000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2346100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2346200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2346300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2346400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2346500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2346600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2346700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2346800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2346900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2347000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2347100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2347200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2347300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2347400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2347500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2347600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 2347700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2347800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2347900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2348000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2348100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2348200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2348300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2348400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2348500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2348600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2348700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 2348800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.029, + "step": 2348900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2349000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2349100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2349200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2349300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2349400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2349500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2349600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2349700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2349800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2349900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2350000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2350100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2350200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2350300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2350400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2350500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2350600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2350700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2350800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2350900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2351000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2351100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2351200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 2351300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2351400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2351500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2351600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2351700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2351800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2351900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2352000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2352100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2352200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2352300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 2352400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2352500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2352600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2352700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2352800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2352900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 2353000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2353100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2353200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2353300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2353400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2353500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2353600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2353700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2353800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2353900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2354000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 2354100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2354200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2354300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2354400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2354500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2354600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2354700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2354800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2354900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2355000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2355100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2355200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0791, + "step": 2355300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2355400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2355500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0294, + "step": 2355600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2355700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2355800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2355900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0293, + "step": 2356000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2356100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2356200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2356300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2356400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2356500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2356600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2356700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2356800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2356900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2357000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2357100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2357200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2357300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2357400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2357500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2357600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2357700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0293, + "step": 2357800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2357900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2358000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2358100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2358200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2358300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2358400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2358500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 2358600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2358700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 2358800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2358900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 2359000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2359100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2359200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2359300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2359400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2359500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2359600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2359700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2359800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2359900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2360000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0284271240234375, + "eval_runtime": 3552.3735, + "eval_samples_per_second": 316.612, + "eval_steps_per_second": 19.788, + "step": 2360000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2360100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2360200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2360300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2360400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2360500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2360600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2360700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2360800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2360900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2361000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2361100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2361200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2361300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2361400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2361500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2361600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2361700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2361800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2361900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2362000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2362100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0292, + "step": 2362200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2362300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2362400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2362500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2362600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2362700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 2362800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2362900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2363000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2363100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2363200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2363300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2363400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2363500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2363600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2363700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 2363800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2363900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2364000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2364100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2364200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2364300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2364400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2364500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2364600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2364700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2364800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2364900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2365000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2365100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2365200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 2365300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0374, + "step": 2365400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2365500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2365600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 2365700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2365800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2365900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2366000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2366100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2366200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2366300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2366400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2366500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2366600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2366700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2366800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2366900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2367000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0294, + "step": 2367100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2367200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2367300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2367400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2367500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2367600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2367700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 2367800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2367900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2368000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2368100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2368200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0293, + "step": 2368300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2368400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 2368500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2368600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2368700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2368800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2368900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2369000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2369100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2369200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2369300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2369400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2369500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2369600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 2369700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2369800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 2369900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2370000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2370100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2370200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2370300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2370400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2370500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2370600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2370700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2370800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2370900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2371000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2371100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2371200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2371300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2371400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0293, + "step": 2371500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2371600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2371700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2371800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2371900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2372000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2372100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2372200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 2372300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2372400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2372500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2372600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2372700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2372800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2372900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2373000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2373100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2373200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2373300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2373400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2373500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2373600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2373700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2373800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2373900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2374000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2374100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2374200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 2374300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2374400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2374500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2374600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2374700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2374800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 2374900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 2375000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 2375100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0356, + "step": 2375200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2375300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2375400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2375500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2375600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2375700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2375800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2375900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2376000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2376100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 2376200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2376300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2376400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2376500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2376600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2376700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2376800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2376900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2377000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2377100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2377200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2377300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2377400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2377500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2377600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2377700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2377800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2377900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2378000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2378100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2378200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2378300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2378400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2378500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2378600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2378700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2378800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 2378900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2379000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2379100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2379200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2379300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2379400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2379500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2379600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 2379700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2379800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2379900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2380000 + }, + { + "epoch": 0.0, + "eval_loss": 0.028656005859375, + "eval_runtime": 3360.1297, + "eval_samples_per_second": 334.726, + "eval_steps_per_second": 20.921, + "step": 2380000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2380100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 2380200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2380300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2380400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2380500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2380600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 2380700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2380800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2380900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2381000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2381100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2381200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2381300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2381400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2381500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2381600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2381700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2381800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2381900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2382000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0296, + "step": 2382100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2382200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2382300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2382400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2382500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2382600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2382700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2382800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2382900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2383000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2383100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2383200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2383300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2383400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2383500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2383600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2383700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2383800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2383900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2384000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2384100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2384200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2384300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2384400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2384500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2384600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2384700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2384800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2384900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2385000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2385100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2385200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2385300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2385400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2385500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2385600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2385700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2385800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2385900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2386000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2386100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2386200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 2386300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2386400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2386500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 2386600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2386700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2386800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2386900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2387000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2387100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2387200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2387300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2387400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2387500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2387600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2387700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0296, + "step": 2387800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2387900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2388000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2388100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2388200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2388300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2388400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 2388500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2388600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2388700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2388800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0291, + "step": 2388900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2389000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2389100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2389200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2389300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2389400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2389500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 2389600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2389700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2389800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2389900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2390000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2390100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2390200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2390300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0294, + "step": 2390400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2390500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2390600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2390700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2390800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0294, + "step": 2390900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2391000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2391100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2391200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2391300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2391400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2391500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2391600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2391700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2391800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2391900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2392000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2392100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2392200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2392300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2392400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2392500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2392600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2392700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2392800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2392900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2393000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2393100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2393200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2393300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2393400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2393500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2393600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 2393700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2393800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2393900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2394000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2394100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2394200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2394300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 2394400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2394500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2394600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2394700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2394800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2394900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2395000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2395100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2395200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2395300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2395400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2395500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0293, + "step": 2395600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2395700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2395800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2395900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2396000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2396100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2396200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2396300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 2396400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2396500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2396600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 2396700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2396800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 2396900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2397000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2397100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2397200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2397300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2397400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2397500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2397600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2397700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2397800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2397900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2398000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2398100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2398200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2398300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2398400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2398500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2398600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2398700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2398800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2398900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2399000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2399100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 2399200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2399300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2399400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2399500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2399600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2399700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2399800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2399900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2400000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0284881591796875, + "eval_runtime": 3271.9298, + "eval_samples_per_second": 343.749, + "eval_steps_per_second": 21.485, + "step": 2400000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2400100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2400200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2400300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2400400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2400500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2400600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0293, + "step": 2400700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2400800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2400900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2401000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2401100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2401200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2401300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2401400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2401500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2401600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2401700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2401800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2401900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2402000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2402100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2402200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2402300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2402400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2402500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2402600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2402700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2402800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2402900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2403000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2403100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2403200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2403300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2403400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2403500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2403600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2403700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2403800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2403900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0287, + "step": 2404000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2404100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2404200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2404300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2404400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2404500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2404600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2404700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2404800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2404900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2405000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2405100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2405200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2405300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2405400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2405500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2405600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2405700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2405800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2405900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2406000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2406100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2406200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2406300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2406400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2406500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2406600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2406700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2406800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2406900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2407000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2407100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2407200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2407300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2407400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2407500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2407600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2407700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0292, + "step": 2407800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2407900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2408000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2408100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2408200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2408300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2408400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2408500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2408600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2408700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2408800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2408900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2409000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2409100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2409200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2409300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2409400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2409500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2409600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2409700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2409800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 2409900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2410000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2410100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2410200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2410300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2410400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 2410500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2410600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 2410700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2410800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2410900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2411000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2411100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2411200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0293, + "step": 2411300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2411400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2411500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2411600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2411700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2411800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2411900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2412000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2412100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2412200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2412300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2412400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2412500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2412600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2412700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2412800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2412900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2413000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2413100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2413200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2413300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2413400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2413500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0292, + "step": 2413600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2413700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2413800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2413900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2414000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2414100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2414200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2414300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2414400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2414500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 2414600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2414700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2414800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2414900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2415000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2415100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2415200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2415300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2415400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2415500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2415600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 2415700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2415800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2415900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2416000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2416100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2416200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2416300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2416400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0293, + "step": 2416500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2416600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0296, + "step": 2416700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2416800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0292, + "step": 2416900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2417000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2417100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2417200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0294, + "step": 2417300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2417400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2417500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2417600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2417700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2417800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2417900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2418000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2418100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2418200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2418300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2418400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2418500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2418600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2418700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2418800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2418900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2419000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2419100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2419200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2419300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2419400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2419500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2419600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2419700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2419800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2419900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2420000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0282440185546875, + "eval_runtime": 3824.0528, + "eval_samples_per_second": 294.118, + "eval_steps_per_second": 18.383, + "step": 2420000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2420100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2420200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2420300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2420400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2420500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2420600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2420700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2420800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2420900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2421000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2421100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2421200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2421300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2421400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2421500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2421600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2421700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2421800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2421900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2422000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2422100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2422200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2422300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2422400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2422500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2422600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2422700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2422800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2422900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2423000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2423100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2423200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2423300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2423400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2423500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2423600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2423700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2423800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2423900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2424000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2424100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2424200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2424300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2424400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2424500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2424600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2424700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2424800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2424900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2425000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2425100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2425200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2425300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2425400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2425500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2425600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2425700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2425800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2425900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2426000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2426100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 2426200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2426300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2426400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2426500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2426600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2426700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2426800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2426900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2427000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2427100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2427200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 2427300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0294, + "step": 2427400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2427500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2427600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2427700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2427800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0289, + "step": 2427900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2428000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2428100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2428200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2428300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2428400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2428500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 2428600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2428700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2428800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2428900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2429000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2429100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2429200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2429300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2429400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2429500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2429600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2429700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2429800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2429900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2430000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2430100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 2430200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2430300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0296, + "step": 2430400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2430500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2430600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2430700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2430800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2430900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2431000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2431100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2431200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2431300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2431400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2431500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2431600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2431700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2431800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 2431900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2432000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 2432100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2432200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2432300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2432400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2432500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2432600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2432700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2432800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2432900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2433000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2433100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2433200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2433300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2433400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2433500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2433600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2433700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2433800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2433900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2434000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2434100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2434200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2434300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2434400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2434500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2434600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2434700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2434800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2434900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2435000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2435100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2435200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2435300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2435400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2435500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2435600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2435700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2435800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2435900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2436000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2436100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2436200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2436300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0296, + "step": 2436400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2436500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 2436600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2436700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2436800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2436900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2437000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2437100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2437200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2437300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2437400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2437500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2437600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2437700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2437800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.029, + "step": 2437900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 2438000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2438100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2438200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2438300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0288, + "step": 2438400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2438500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2438600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2438700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2438800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 2438900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2439000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2439100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2439200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2439300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2439400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2439500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0294, + "step": 2439600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2439700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2439800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2439900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2440000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0283966064453125, + "eval_runtime": 3407.4332, + "eval_samples_per_second": 330.079, + "eval_steps_per_second": 20.63, + "step": 2440000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2440100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2440200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2440300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 2440400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2440500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0294, + "step": 2440600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2440700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2440800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2440900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2441000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2441100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2441200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2441300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2441400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2441500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 2441600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2441700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2441800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0296, + "step": 2441900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2442000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2442100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2442200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2442300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2442400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2442500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 2442600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2442700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2442800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2442900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2443000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2443100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2443200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2443300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2443400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2443500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2443600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2443700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2443800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2443900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2444000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2444100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2444200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2444300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2444400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2444500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2444600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 2444700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2444800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2444900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2445000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2445100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 2445200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2445300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0294, + "step": 2445400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2445500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2445600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 2445700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2445800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2445900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2446000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2446100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2446200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2446300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2446400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2446500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2446600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2446700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2446800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2446900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 2447000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2447100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2447200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2447300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2447400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2447500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2447600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2447700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2447800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2447900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2448000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2448100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2448200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2448300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2448400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2448500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2448600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2448700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2448800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2448900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2449000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2449100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2449200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2449300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2449400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0296, + "step": 2449500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2449600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2449700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0291, + "step": 2449800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0292, + "step": 2449900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2450000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2450100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0293, + "step": 2450200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2450300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2450400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2450500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2450600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2450700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2450800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2450900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2451000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2451100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 2451200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2451300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2451400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2451500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2451600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2451700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2451800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2451900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2452000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2452100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2452200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2452300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2452400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2452500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2452600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2452700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2452800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2452900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2453000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2453100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2453200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2453300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0286, + "step": 2453400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2453500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2453600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2453700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2453800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2453900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2454000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 2454100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2454200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2454300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2454400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2454500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 2454600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2454700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2454800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2454900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2455000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2455100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2455200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2455300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2455400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2455500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2455600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2455700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2455800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2455900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2456000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2456100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2456200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2456300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2456400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2456500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2456600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2456700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 2456800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 2456900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2457000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2457100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2457200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2457300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2457400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2457500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2457600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2457700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2457800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2457900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2458000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0767, + "step": 2458100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0451, + "step": 2458200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 2458300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 2458400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2458500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2458600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 2458700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2458800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2458900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2459000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2459100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2459200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2459300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 2459400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0405, + "step": 2459500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2459600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2459700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2459800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 2459900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0481, + "step": 2460000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0302581787109375, + "eval_runtime": 3388.8848, + "eval_samples_per_second": 331.886, + "eval_steps_per_second": 20.743, + "step": 2460000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 2460100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2460200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2460300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2460400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 2460500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2460600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2460700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2460800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2460900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0296, + "step": 2461000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 2461100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 2461200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2461300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2461400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0287, + "step": 2461500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2461600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2461700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2461800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2461900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2462000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2462100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2462200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2462300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2462400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 2462500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2462600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2462700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 2462800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2462900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 2463000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2463100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2463200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2463300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2463400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 2463500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2463600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2463700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 2463800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2463900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 2464000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2464100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 2464200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2464300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2464400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2464500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 2464600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2464700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 2464800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2464900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2465000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2465100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2465200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2465300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2465400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2465500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2465600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2465700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2465800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0372, + "step": 2465900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 2466000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2466100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 2466200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2466300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2466400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2466500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2466600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2466700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2466800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2466900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2467000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2467100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2467200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2467300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2467400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2467500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2467600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2467700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2467800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2467900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2468000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2468100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2468200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2468300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2468400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2468500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2468600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2468700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 2468800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2468900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 2469000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2469100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2469200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2469300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2469400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2469500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2469600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2469700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2469800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2469900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2470000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2470100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2470200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2470300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2470400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2470500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2470600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2470700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2470800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2470900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2471000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2471100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0293, + "step": 2471200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2471300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2471400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2471500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2471600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2471700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 2471800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 2471900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2472000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 2472100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 2472200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 2472300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2472400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 2472500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 2472600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2472700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2472800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2472900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2473000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 2473100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 2473200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2473300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2473400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2473500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2473600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2473700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2473800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0293, + "step": 2473900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2474000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2474100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2474200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2474300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2474400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2474500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2474600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2474700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2474800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2474900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2475000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2475100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 2475200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2475300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2475400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0593, + "step": 2475500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 2475600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2475700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2475800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2475900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2476000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2476100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2476200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2476300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2476400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2476500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2476600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2476700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2476800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2476900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2477000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2477100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2477200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2477300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 2477400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2477500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2477600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2477700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2477800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2477900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2478000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2478100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2478200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2478300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 2478400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2478500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2478600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2478700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2478800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2478900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2479000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2479100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2479200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2479300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2479400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0436, + "step": 2479500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2479600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2479700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2479800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2479900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2480000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0283966064453125, + "eval_runtime": 3390.2297, + "eval_samples_per_second": 331.754, + "eval_steps_per_second": 20.735, + "step": 2480000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2480100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2480200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2480300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 2480400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0386, + "step": 2480500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0418, + "step": 2480600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0454, + "step": 2480700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0368, + "step": 2480800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 2480900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 2481000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2481100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2481200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2481300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2481400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2481500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2481600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2481700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2481800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2481900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2482000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2482100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2482200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2482300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2482400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2482500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2482600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2482700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2482800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2482900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2483000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 2483100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2483200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2483300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2483400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2483500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2483600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2483700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2483800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2483900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2484000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2484100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2484200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2484300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2484400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2484500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2484600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2484700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0292, + "step": 2484800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2484900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2485000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2485100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2485200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2485300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2485400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2485500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 2485600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2485700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2485800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2485900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2486000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2486100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2486200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2486300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2486400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2486500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0289, + "step": 2486600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2486700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2486800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 2486900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2487000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2487100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2487200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2487300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2487400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2487500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2487600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2487700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2487800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2487900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2488000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2488100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 2488200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 2488300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2488400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2488500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 2488600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2488700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2488800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2488900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2489000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2489100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2489200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2489300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 2489400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2489500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2489600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0296, + "step": 2489700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2489800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 2489900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2490000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2490100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2490200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0294, + "step": 2490300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2490400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2490500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2490600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2490700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2490800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2490900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2491000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0401, + "step": 2491100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 2491200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2491300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 2491400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 2491500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 2491600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 2491700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2491800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 2491900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2492000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2492100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2492200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 2492300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2492400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2492500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 2492600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2492700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 2492800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2492900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2493000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2493100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2493200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2493300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2493400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2493500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2493600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2493700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2493800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 2493900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 2494000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0405, + "step": 2494100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1806, + "step": 2494200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2494300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2494400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2494500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2494600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2494700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2494800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 2494900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2495000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2495100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 2495200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2495300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 2495400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 2495500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 2495600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2495700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2495800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2495900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2496000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2496100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2496200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2496300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2496400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2496500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 2496600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2496700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2496800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 2496900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2497000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2497100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2497200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2497300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 2497400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2497500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 2497600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2497700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2497800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2497900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2498000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2498100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2498200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2498300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2498400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2498500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2498600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 2498700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2498800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2498900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2499000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2499100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2499200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2499300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2499400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2499500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2499600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2499700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2499800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2499900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2500000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0286407470703125, + "eval_runtime": 3397.6229, + "eval_samples_per_second": 331.032, + "eval_steps_per_second": 20.69, + "step": 2500000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2500100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2500200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2500300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2500400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2500500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 2500600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2500700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2500800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2500900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2501000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2501100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2501200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2501300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2501400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2501500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0294, + "step": 2501600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2501700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2501800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2501900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 2502000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 2502100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2502200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 2502300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2502400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2502500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2502600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2502700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2502800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2502900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2503000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2503100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2503200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 2503300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2503400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2503500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2503600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2503700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 2503800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2503900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2504000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2504100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2504200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2504300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2504400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2504500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2504600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2504700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2504800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2504900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2505000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2505100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2505200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2505300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2505400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2505500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2505600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2505700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2505800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 2505900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2506000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2506100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2506200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2506300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2506400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2506500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2506600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2506700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2506800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2506900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2507000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 2507100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2507200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2507300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2507400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 2507500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2507600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2507700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2507800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2507900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2508000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2508100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2508200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0294, + "step": 2508300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2508400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2508500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2508600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2508700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 2508800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2508900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2509000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2509100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2509200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2509300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2509400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2509500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2509600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2509700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2509800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2509900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2510000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2510100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2510200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2510300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2510400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2510500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2510600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 2510700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2510800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2510900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0291, + "step": 2511000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2511100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2511200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0296, + "step": 2511300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2511400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2511500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2511600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2511700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2511800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2511900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2512000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2512100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2512200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0285, + "step": 2512300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2512400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2512500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2512600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2512700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2512800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2512900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2513000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2513100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2513200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2513300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2513400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2513500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2513600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2513700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2513800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2513900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2514000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2514100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2514200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2514300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 2514400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2514500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 2514600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 2514700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2514800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2514900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 2515000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0531, + "step": 2515100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2515200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2515300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2515400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2515500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2515600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2515700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 2515800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 2515900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2516000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2516100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2516200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2516300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2516400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2516500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2516600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2516700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2516800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2516900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2517000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 2517100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2517200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2517300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2517400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2517500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2517600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2517700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2517800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2517900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2518000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2518100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2518200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2518300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2518400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2518500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2518600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2518700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2518800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2518900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2519000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2519100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2519200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2519300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2519400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2519500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2519600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 2519700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2519800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2519900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2520000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0289154052734375, + "eval_runtime": 3357.6743, + "eval_samples_per_second": 334.971, + "eval_steps_per_second": 20.936, + "step": 2520000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 2520100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2520200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2520300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2520400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2520500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 2520600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2520700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2520800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2520900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2521000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2521100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2521200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 2521300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 2521400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 2521500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2521600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 2521700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2521800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2521900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2522000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2522100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2522200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2522300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2522400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2522500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2522600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2522700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2522800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2522900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2523000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2523100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2523200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2523300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2523400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2523500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2523600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 2523700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2523800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2523900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2524000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2524100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2524200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2524300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2524400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2524500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2524600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2524700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2524800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2524900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2525000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 2525100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2525200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2525300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0736, + "step": 2525400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2525500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2525600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 2525700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2525800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2525900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2526000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2526100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2526200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2526300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2526400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2526500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2526600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2526700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2526800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2526900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2527000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2527100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 2527200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2527300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 2527400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2527500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2527600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2527700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2527800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2527900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2528000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2528100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2528200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0294, + "step": 2528300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2528400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2528500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2528600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2528700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2528800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2528900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2529000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2529100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2529200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2529300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2529400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2529500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2529600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2529700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2529800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2529900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2530000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2530100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2530200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2530300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2530400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2530500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2530600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 2530700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2530800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2530900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2531000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2531100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2531200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2531300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0293, + "step": 2531400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2531500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2531600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 2531700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2531800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2531900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2532000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2532100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2532200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2532300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2532400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2532500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2532600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2532700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2532800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2532900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2533000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2533100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2533200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2533300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 2533400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2533500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2533600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2533700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0294, + "step": 2533800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2533900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2534000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2534100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2534200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2534300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2534400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2534500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2534600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2534700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2534800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2534900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2535000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2535100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2535200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2535300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2535400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2535500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2535600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2535700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2535800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2535900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2536000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2536100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2536200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2536300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2536400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 2536500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 2536600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2536700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2536800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2536900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2537000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2537100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2537200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2537300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2537400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2537500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2537600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2537700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2537800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2537900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2538000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2538100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0418, + "step": 2538200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 2538300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2538400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2538500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2538600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2538700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2538800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2538900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2539000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2539100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2539200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2539300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2539400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2539500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2539600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2539700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2539800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2539900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2540000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0288848876953125, + "eval_runtime": 3563.5068, + "eval_samples_per_second": 315.623, + "eval_steps_per_second": 19.727, + "step": 2540000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2540100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2540200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2540300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2540400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2540500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 2540600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2540700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2540800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2540900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 2541000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2541100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 2541200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2541300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2541400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2541500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2541600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2541700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2541800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2541900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2542000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0352, + "step": 2542100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2542200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2542300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2542400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2542500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2542600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2542700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2542800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2542900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2543000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2543100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2543200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2543300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2543400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 2543500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2543600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2543700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0296, + "step": 2543800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2543900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2544000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2544100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2544200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2544300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2544400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2544500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2544600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2544700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2544800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 2544900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2545000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2545100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2545200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2545300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2545400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2545500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2545600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2545700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2545800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 2545900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2546000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2546100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2546200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2546300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2546400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2546500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2546600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2546700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2546800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2546900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2547000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2547100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2547200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2547300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2547400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 2547500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2547600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2547700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2547800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2547900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2548000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2548100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2548200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2548300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2548400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2548500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2548600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2548700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 2548800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2548900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2549000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2549100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2549200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2549300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2549400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 2549500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2549600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2549700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2549800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2549900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2550000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2550100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2550200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2550300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2550400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2550500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2550600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2550700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2550800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2550900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2551000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2551100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2551200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2551300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2551400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2551500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2551600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2551700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 2551800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 2551900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 2552000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2552100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2552200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 2552300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 2552400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2552500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2552600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2552700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2552800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2552900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2553000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2553100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2553200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0293, + "step": 2553300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2553400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2553500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2553600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2553700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 2553800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2553900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2554000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2554100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2554200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2554300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2554400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 2554500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2554600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2554700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2554800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2554900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2555000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2555100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2555200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2555300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 2555400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2555500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2555600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2555700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2555800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2555900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2556000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2556100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2556200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 2556300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 2556400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2556500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2556600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2556700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 2556800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2556900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 2557000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2557100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 2557200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 2557300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2557400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2557500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2557600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2557700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2557800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2557900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2558000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2558100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2558200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2558300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2558400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2558500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2558600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2558700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2558800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2558900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2559000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2559100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2559200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2559300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2559400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2559500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2559600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2559700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2559800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2559900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2560000 + }, + { + "epoch": 0.0, + "eval_loss": 0.028411865234375, + "eval_runtime": 3241.7894, + "eval_samples_per_second": 346.945, + "eval_steps_per_second": 21.684, + "step": 2560000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0292, + "step": 2560100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2560200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2560300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2560400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2560500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2560600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2560700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2560800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2560900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2561000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2561100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2561200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2561300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2561400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2561500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.029, + "step": 2561600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2561700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2561800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 2561900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2562000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2562100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2562200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2562300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2562400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2562500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2562600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2562700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2562800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2562900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2563000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2563100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2563200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2563300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2563400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2563500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2563600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2563700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2563800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2563900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2564000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2564100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2564200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2564300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2564400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2564500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2564600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2564700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2564800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 2564900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2565000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 2565100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2565200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2565300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 2565400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2565500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2565600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2565700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2565800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0293, + "step": 2565900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2566000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2566100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2566200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2566300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2566400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2566500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2566600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2566700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 2566800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2566900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2567000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2567100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2567200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2567300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2567400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2567500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2567600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2567700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 2567800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2567900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 2568000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2568100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2568200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2568300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2568400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2568500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2568600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2568700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2568800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2568900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2569000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2569100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2569200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2569300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0293, + "step": 2569400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 2569500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2569600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2569700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2569800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2569900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2570000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2570100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2570200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2570300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2570400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0293, + "step": 2570500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2570600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2570700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2570800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2570900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2571000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0291, + "step": 2571100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2571200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2571300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2571400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2571500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2571600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2571700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2571800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2571900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2572000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2572100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2572200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2572300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2572400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2572500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2572600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2572700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2572800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2572900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0296, + "step": 2573000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2573100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2573200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2573300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2573400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2573500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2573600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2573700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2573800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2573900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2574000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2574100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2574200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2574300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2574400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2574500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2574600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2574700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2574800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2574900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2575000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2575100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2575200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2575300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2575400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2575500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2575600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2575700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2575800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2575900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2576000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2576100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2576200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2576300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2576400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2576500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2576600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2576700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2576800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2576900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2577000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2577100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2577200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2577300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2577400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2577500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2577600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 2577700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2577800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2577900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2578000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2578100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2578200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2578300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2578400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2578500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2578600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2578700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2578800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2578900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2579000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2579100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2579200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2579300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2579400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2579500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2579600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2579700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2579800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0289, + "step": 2579900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2580000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0283966064453125, + "eval_runtime": 3042.109, + "eval_samples_per_second": 369.718, + "eval_steps_per_second": 23.108, + "step": 2580000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2580100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 2580200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2580300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2580400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 2580500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2580600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2580700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2580800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 2580900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2581000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2581100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2581200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2581300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2581400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 2581500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2581600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2581700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 2581800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2581900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0291, + "step": 2582000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2582100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2582200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2582300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2582400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2582500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2582600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0294, + "step": 2582700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2582800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2582900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2583000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2583100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2583200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2583300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2583400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2583500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2583600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2583700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2583800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2583900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 2584000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2584100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2584200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2584300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2584400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2584500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2584600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2584700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2584800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2584900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2585000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2585100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2585200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2585300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2585400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2585500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2585600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2585700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2585800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2585900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2586000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2586100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2586200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 2586300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 2586400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2586500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2586600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2586700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2586800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2586900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2587000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2587100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2587200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 2587300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 2587400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2587500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0296, + "step": 2587600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2587700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2587800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2587900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2588000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2588100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2588200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2588300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2588400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2588500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0284, + "step": 2588600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2588700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2588800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2588900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2589000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2589100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 2589200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2589300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2589400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2589500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0296, + "step": 2589600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2589700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2589800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2589900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2590000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2590100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2590200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2590300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2590400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2590500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2590600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2590700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2590800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2590900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2591000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2591100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2591200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2591300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2591400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2591500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2591600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2591700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2591800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2591900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2592000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2592100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2592200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 2592300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2592400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2592500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2592600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2592700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2592800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2592900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2593000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2593100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2593200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2593300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2593400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2593500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2593600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2593700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2593800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2593900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2594000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2594100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2594200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2594300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2594400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2594500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2594600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2594700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2594800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2594900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2595000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2595100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2595200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2595300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2595400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2595500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2595600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2595700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2595800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2595900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2596000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2596100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2596200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2596300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2596400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2596500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2596600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2596700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2596800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2596900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2597000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2597100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2597200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2597300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2597400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2597500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0398, + "step": 2597600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0402, + "step": 2597700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2597800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2597900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2598000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2598100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2598200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2598300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2598400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2598500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 2598600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 2598700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2598800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2598900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2599000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2599100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2599200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2599300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2599400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2599500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2599600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2599700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2599800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2599900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2600000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0279998779296875, + "eval_runtime": 3070.9376, + "eval_samples_per_second": 366.247, + "eval_steps_per_second": 22.891, + "step": 2600000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2600100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2600200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2600300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2600400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2600500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2600600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2600700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 2600800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2600900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2601000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2601100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2601200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2601300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2601400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2601500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2601600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0403, + "step": 2601700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2601800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 2601900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 2602000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 2602100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2602200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2602300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2602400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2602500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2602600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2602700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2602800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 2602900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2603000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2603100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2603200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2603300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2603400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2603500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2603600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2603700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2603800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2603900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2604000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2604100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2604200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2604300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2604400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2604500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2604600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2604700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2604800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2604900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2605000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2605100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2605200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2605300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2605400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2605500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2605600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2605700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 2605800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2605900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2606000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2606100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2606200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2606300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2606400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2606500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2606600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2606700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2606800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2606900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2607000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2607100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2607200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2607300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2607400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2607500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2607600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2607700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2607800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 2607900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2608000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2608100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2608200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2608300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2608400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2608500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2608600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2608700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2608800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2608900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2609000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2609100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2609200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2609300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2609400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2609500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2609600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2609700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2609800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2609900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2610000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2610100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2610200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2610300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2610400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2610500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2610600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2610700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2610800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 2610900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2611000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2611100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2611200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2611300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2611400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2611500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2611600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2611700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2611800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0472, + "step": 2611900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 2612000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 2612100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2612200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2612300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2612400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2612500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2612600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2612700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2612800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2612900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2613000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2613100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2613200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2613300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2613400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2613500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2613600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2613700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2613800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2613900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2614000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2614100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2614200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2614300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2614400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2614500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2614600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2614700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0296, + "step": 2614800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2614900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2615000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2615100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2615200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2615300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2615400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2615500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2615600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 2615700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2615800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 2615900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2616000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2616100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2616200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2616300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2616400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2616500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2616600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2616700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2616800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2616900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2617000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2617100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2617200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2617300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2617400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2617500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2617600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2617700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2617800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2617900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2618000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 2618100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 2618200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 2618300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.038, + "step": 2618400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 2618500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2618600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2618700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2618800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2618900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2619000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2619100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2619200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2619300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2619400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2619500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2619600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2619700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2619800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2619900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2620000 + }, + { + "epoch": 0.0, + "eval_loss": 0.028076171875, + "eval_runtime": 3158.1346, + "eval_samples_per_second": 356.135, + "eval_steps_per_second": 22.259, + "step": 2620000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0293, + "step": 2620100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2620200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2620300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2620400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2620500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2620600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2620700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2620800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2620900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 2621000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2621100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2621200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2621300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0296, + "step": 2621400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2621500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2621600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2621700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2621800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2621900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2622000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2622100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2622200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2622300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2622400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2622500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2622600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2622700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2622800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2622900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2623000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2623100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2623200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0402, + "step": 2623300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 2623400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2623500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2623600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2623700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2623800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2623900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2624000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2624100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2624200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2624300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0367, + "step": 2624400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 2624500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2624600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2624700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2624800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2624900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2625000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0292, + "step": 2625100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2625200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2625300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2625400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2625500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2625600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2625700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2625800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2625900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2626000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2626100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2626200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2626300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2626400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2626500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2626600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2626700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2626800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2626900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2627000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2627100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2627200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2627300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2627400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2627500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2627600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2627700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2627800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2627900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2628000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0296, + "step": 2628100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2628200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2628300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2628400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 2628500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2628600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 2628700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 2628800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2628900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2629000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2629100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2629200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2629300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2629400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2629500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2629600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2629700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2629800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2629900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2630000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2630100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2630200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2630300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2630400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2630500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2630600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2630700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2630800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2630900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0296, + "step": 2631000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2631100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2631200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2631300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.029, + "step": 2631400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2631500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2631600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2631700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2631800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2631900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2632000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 2632100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2632200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2632300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2632400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2632500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2632600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2632700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2632800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2632900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2633000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2633100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0286, + "step": 2633200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2633300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2633400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2633500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2633600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2633700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2633800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0296, + "step": 2633900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2634000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2634100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2634200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2634300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2634400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2634500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2634600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 2634700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2634800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0291, + "step": 2634900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2635000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2635100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 2635200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2635300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2635400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2635500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2635600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2635700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2635800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2635900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2636000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2636100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2636200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 2636300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2636400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2636500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2636600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2636700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2636800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2636900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 2637000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2637100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 2637200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2637300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2637400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2637500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0294, + "step": 2637600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2637700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2637800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2637900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2638000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2638100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2638200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2638300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2638400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2638500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2638600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 2638700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2638800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2638900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2639000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2639100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2639200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2639300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2639400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2639500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2639600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2639700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2639800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2639900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2640000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0287933349609375, + "eval_runtime": 3157.0248, + "eval_samples_per_second": 356.26, + "eval_steps_per_second": 22.267, + "step": 2640000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2640100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2640200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 2640300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2640400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2640500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2640600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2640700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2640800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2640900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2641000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2641100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2641200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2641300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2641400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2641500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2641600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2641700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2641800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2641900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2642000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0293, + "step": 2642100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2642200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2642300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2642400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2642500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2642600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2642700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2642800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2642900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2643000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2643100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2643200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 2643300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2643400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2643500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2643600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2643700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2643800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2643900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2644000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2644100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.071, + "step": 2644200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 2644300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2644400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2644500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2644600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2644700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2644800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2644900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2645000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 2645100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2645200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2645300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2645400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2645500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2645600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2645700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2645800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2645900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2646000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2646100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2646200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 2646300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2646400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2646500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2646600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2646700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2646800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2646900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2647000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0294, + "step": 2647100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2647200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 2647300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2647400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2647500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 2647600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2647700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2647800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2647900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2648000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2648100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 2648200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2648300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2648400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2648500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2648600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2648700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2648800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2648900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2649000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2649100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2649200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2649300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2649400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2649500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2649600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2649700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2649800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2649900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2650000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2650100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2650200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2650300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2650400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2650500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2650600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2650700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2650800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2650900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 2651000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2651100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 2651200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2651300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2651400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2651500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2651600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2651700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2651800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2651900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2652000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2652100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2652200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2652300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2652400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2652500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2652600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2652700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2652800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2652900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2653000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2653100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2653200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2653300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2653400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2653500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2653600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2653700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2653800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0296, + "step": 2653900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2654000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2654100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2654200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2654300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2654400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0292, + "step": 2654500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2654600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2654700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2654800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2654900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0294, + "step": 2655000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2655100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2655200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2655300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2655400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2655500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2655600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2655700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2655800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2655900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2656000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 2656100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2656200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2656300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2656400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2656500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2656600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2656700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2656800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2656900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2657000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2657100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2657200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2657300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2657400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2657500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2657600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2657700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2657800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2657900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2658000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2658100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2658200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 2658300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2658400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2658500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2658600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2658700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2658800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2658900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2659000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2659100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2659200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2659300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2659400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2659500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2659600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2659700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2659800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2659900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0293, + "step": 2660000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0283203125, + "eval_runtime": 3160.9552, + "eval_samples_per_second": 355.817, + "eval_steps_per_second": 22.239, + "step": 2660000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2660100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0296, + "step": 2660200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2660300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2660400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2660500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2660600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2660700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2660800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2660900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2661000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2661100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2661200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 2661300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2661400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2661500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 2661600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2661700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2661800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2661900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2662000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2662100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2662200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2662300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2662400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 2662500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2662600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2662700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 2662800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0571, + "step": 2662900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2663000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2663100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2663200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2663300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 2663400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2663500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2663600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2663700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2663800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 2663900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2664000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2664100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2664200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2664300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2664400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2664500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2664600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2664700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2664800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2664900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0291, + "step": 2665000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2665100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2665200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2665300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2665400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2665500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 2665600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2665700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2665800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2665900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2666000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2666100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2666200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2666300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2666400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2666500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2666600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2666700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2666800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 2666900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 2667000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2667100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2667200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2667300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2667400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0294, + "step": 2667500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2667600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2667700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2667800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2667900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2668000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 2668100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 2668200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2668300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 2668400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 2668500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2668600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2668700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2668800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2668900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2669000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2669100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2669200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2669300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2669400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2669500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2669600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2669700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2669800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2669900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2670000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 2670100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 2670200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2670300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0296, + "step": 2670400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2670500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2670600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2670700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2670800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2670900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2671000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2671100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2671200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2671300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2671400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2671500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2671600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2671700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 2671800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2671900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2672000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2672100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2672200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2672300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2672400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2672500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2672600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2672700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2672800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2672900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2673000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2673100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2673200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2673300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2673400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2673500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2673600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 2673700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2673800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2673900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2674000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2674100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2674200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2674300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2674400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2674500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2674600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2674700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2674800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2674900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2675000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2675100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0296, + "step": 2675200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2675300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2675400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2675500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2675600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2675700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2675800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2675900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2676000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2676100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2676200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2676300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 2676400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2676500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2676600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2676700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2676800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2676900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2677000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2677100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2677200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2677300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2677400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2677500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2677600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2677700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2677800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2677900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2678000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2678100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2678200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2678300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2678400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2678500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2678600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2678700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2678800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2678900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2679000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2679100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2679200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2679300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2679400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2679500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0296, + "step": 2679600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2679700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 2679800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2679900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2680000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0281524658203125, + "eval_runtime": 3109.0725, + "eval_samples_per_second": 361.755, + "eval_steps_per_second": 22.61, + "step": 2680000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2680100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2680200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2680300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2680400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2680500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2680600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2680700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2680800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2680900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2681000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2681100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2681200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 2681300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2681400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2681500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2681600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2681700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2681800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2681900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2682000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2682100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2682200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2682300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0293, + "step": 2682400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2682500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2682600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 2682700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0294, + "step": 2682800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2682900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2683000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2683100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2683200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2683300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2683400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0296, + "step": 2683500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2683600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2683700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2683800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2683900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2684000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2684100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2684200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2684300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2684400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2684500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2684600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2684700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2684800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2684900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2685000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2685100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0296, + "step": 2685200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2685300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2685400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2685500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0296, + "step": 2685600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2685700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2685800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2685900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2686000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2686100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2686200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2686300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2686400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2686500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2686600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2686700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 2686800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 2686900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2687000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2687100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0376, + "step": 2687200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2687300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2687400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2687500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2687600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2687700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2687800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2687900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2688000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0292, + "step": 2688100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2688200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2688300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2688400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2688500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2688600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2688700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2688800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 2688900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2689000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 2689100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2689200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 2689300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2689400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2689500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2689600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2689700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2689800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2689900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2690000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2690100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2690200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2690300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2690400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2690500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2690600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2690700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2690800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2690900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2691000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2691100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2691200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2691300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2691400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2691500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2691600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2691700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2691800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2691900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2692000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2692100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2692200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2692300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2692400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2692500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2692600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2692700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2692800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2692900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2693000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2693100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0293, + "step": 2693200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2693300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2693400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2693500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2693600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2693700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2693800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2693900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2694000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2694100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2694200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2694300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2694400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2694500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2694600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2694700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2694800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2694900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2695000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2695100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2695200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2695300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2695400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2695500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2695600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2695700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 2695800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2695900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2696000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 2696100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2696200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2696300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2696400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2696500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2696600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2696700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2696800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2696900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2697000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2697100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2697200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2697300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2697400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2697500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2697600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 2697700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2697800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2697900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2698000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2698100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2698200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2698300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2698400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 2698500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2698600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 2698700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0291, + "step": 2698800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2698900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2699000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2699100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2699200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2699300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2699400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2699500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2699600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2699700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2699800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2699900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2700000 + }, + { + "epoch": 0.0, + "eval_loss": 0.028289794921875, + "eval_runtime": 3653.0406, + "eval_samples_per_second": 307.887, + "eval_steps_per_second": 19.243, + "step": 2700000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2700100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2700200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2700300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2700400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2700500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2700600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2700700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2700800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2700900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2701000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2701100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0291, + "step": 2701200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2701300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2701400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2701500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2701600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2701700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2701800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2701900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2702000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2702100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2702200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2702300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2702400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2702500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2702600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2702700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2702800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2702900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2703000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2703100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2703200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2703300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2703400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 2703500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2703600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2703700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2703800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2703900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2704000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2704100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2704200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2704300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2704400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2704500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2704600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0285, + "step": 2704700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2704800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2704900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2705000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2705100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2705200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2705300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2705400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2705500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2705600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2705700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2705800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2705900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2706000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 2706100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2706200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2706300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2706400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2706500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2706600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2706700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2706800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2706900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2707000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2707100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 2707200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2707300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2707400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2707500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2707600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2707700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2707800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 2707900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2708000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2708100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2708200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2708300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2708400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2708500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2708600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2708700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2708800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2708900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2709000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2709100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2709200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2709300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2709400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2709500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2709600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2709700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 2709800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2709900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2710000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2710100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0292, + "step": 2710200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 2710300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2710400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2710500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2710600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2710700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2710800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2710900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2711000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2711100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2711200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2711300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2711400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2711500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2711600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0296, + "step": 2711700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 2711800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2711900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2712000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0293, + "step": 2712100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2712200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2712300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2712400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2712500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2712600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2712700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2712800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2712900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2713000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2713100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2713200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2713300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2713400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2713500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2713600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2713700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 2713800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2713900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2714000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2714100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2714200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2714300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2714400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2714500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2714600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2714700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2714800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2714900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2715000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2715100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2715200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2715300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2715400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2715500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2715600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2715700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2715800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2715900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2716000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2716100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2716200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2716300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2716400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2716500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2716600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2716700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 2716800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2716900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2717000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2717100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2717200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2717300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2717400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2717500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2717600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 2717700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2717800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2717900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2718000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2718100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2718200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2718300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2718400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2718500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2718600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2718700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2718800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2718900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2719000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2719100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2719200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2719300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 2719400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 2719500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2719600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2719700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0382, + "step": 2719800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0416, + "step": 2719900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2720000 + }, + { + "epoch": 0.0, + "eval_loss": 0.028289794921875, + "eval_runtime": 3544.2998, + "eval_samples_per_second": 317.333, + "eval_steps_per_second": 19.834, + "step": 2720000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2720100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2720200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2720300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2720400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2720500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2720600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2720700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2720800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2720900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2721000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2721100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2721200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2721300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2721400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2721500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2721600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2721700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2721800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2721900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2722000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2722100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2722200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2722300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2722400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2722500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2722600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0292, + "step": 2722700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2722800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2722900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2723000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2723100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2723200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2723300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2723400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2723500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2723600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2723700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2723800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2723900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2724000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2724100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2724200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2724300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2724400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2724500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2724600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2724700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2724800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2724900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2725000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2725100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 2725200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2725300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2725400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2725500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0296, + "step": 2725600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2725700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0291, + "step": 2725800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2725900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2726000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2726100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2726200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2726300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2726400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2726500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2726600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2726700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2726800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2726900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2727000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2727100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2727200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2727300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2727400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2727500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0372, + "step": 2727600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2727700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2727800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2727900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2728000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2728100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2728200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0296, + "step": 2728300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2728400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2728500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 2728600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2728700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2728800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2728900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2729000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2729100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2729200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2729300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2729400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2729500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2729600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2729700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2729800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2729900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2730000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2730100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2730200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2730300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2730400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2730500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2730600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2730700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2730800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2730900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2731000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2731100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2731200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2731300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2731400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0294, + "step": 2731500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2731600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2731700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2731800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0461, + "step": 2731900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0532, + "step": 2732000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2732100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 2732200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2732300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2732400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2732500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2732600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2732700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2732800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2732900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2733000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2733100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 2733200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2733300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2733400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2733500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2733600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2733700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2733800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2733900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2734000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2734100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2734200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 2734300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2734400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2734500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2734600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2734700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2734800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2734900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2735000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2735100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2735200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2735300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2735400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0391, + "step": 2735500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2735600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2735700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2735800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2735900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2736000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0296, + "step": 2736100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2736200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2736300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2736400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2736500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2736600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 2736700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2736800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2736900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2737000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2737100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2737200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2737300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2737400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2737500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2737600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2737700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2737800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2737900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2738000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2738100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2738200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 2738300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2738400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2738500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2738600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2738700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2738800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2738900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2739000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2739100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2739200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2739300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2739400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2739500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2739600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2739700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2739800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 2739900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2740000 + }, + { + "epoch": 0.0, + "eval_loss": 0.028472900390625, + "eval_runtime": 3617.2028, + "eval_samples_per_second": 310.937, + "eval_steps_per_second": 19.434, + "step": 2740000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2740100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2740200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 2740300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2740400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2740500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2740600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 2740700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2740800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2740900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2741000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2741100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2741200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2741300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2741400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2741500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2741600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2741700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2741800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 2741900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2742000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2742100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2742200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2742300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2742400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2742500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2742600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2742700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2742800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2742900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2743000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2743100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2743200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2743300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2743400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2743500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2743600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2743700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2743800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2743900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2744000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2744100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 2744200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2744300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2744400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 2744500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2744600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2744700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2744800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2744900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2745000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2745100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2745200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2745300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2745400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2745500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0292, + "step": 2745600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2745700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2745800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2745900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2746000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 2746100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2746200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2746300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2746400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2746500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2746600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2746700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2746800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2746900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2747000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2747100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2747200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2747300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2747400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2747500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2747600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2747700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2747800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2747900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2748000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2748100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2748200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2748300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2748400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2748500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2748600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2748700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2748800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2748900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2749000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2749100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2749200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2749300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2749400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2749500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2749600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 2749700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2749800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2749900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2750000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2750100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2750200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2750300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 2750400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0296, + "step": 2750500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0296, + "step": 2750600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2750700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2750800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2750900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2751000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2751100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2751200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2751300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2751400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2751500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2751600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2751700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2751800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2751900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2752000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2752100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2752200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2752300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2752400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2752500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2752600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2752700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2752800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2752900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 2753000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2753100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2753200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2753300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2753400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2753500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2753600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2753700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2753800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2753900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2754000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2754100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2754200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2754300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2754400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2754500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2754600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2754700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2754800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2754900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2755000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2755100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2755200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2755300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2755400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2755500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2755600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2755700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 2755800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2755900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2756000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2756100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2756200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2756300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2756400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0296, + "step": 2756500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 2756600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0296, + "step": 2756700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2756800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2756900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2757000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2757100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2757200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2757300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2757400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2757500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2757600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2757700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2757800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 2757900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2758000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2758100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2758200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2758300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2758400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2758500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2758600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2758700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2758800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2758900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2759000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 2759100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2759200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2759300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2759400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2759500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 2759600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2759700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2759800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2759900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2760000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0284576416015625, + "eval_runtime": 3352.5955, + "eval_samples_per_second": 335.478, + "eval_steps_per_second": 20.968, + "step": 2760000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2760100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2760200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0296, + "step": 2760300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2760400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2760500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2760600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2760700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2760800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2760900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2761000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2761100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2761200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2761300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2761400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2761500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2761600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2761700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0296, + "step": 2761800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2761900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2762000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2762100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 2762200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2762300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2762400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2762500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2762600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2762700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2762800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2762900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2763000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2763100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2763200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0294, + "step": 2763300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2763400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2763500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2763600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2763700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2763800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2763900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2764000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2764100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2764200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.029, + "step": 2764300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2764400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2764500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0291, + "step": 2764600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2764700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0294, + "step": 2764800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2764900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2765000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2765100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2765200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2765300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2765400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 2765500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2765600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2765700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2765800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2765900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2766000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2766100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2766200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2766300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2766400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2766500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2766600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2766700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2766800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2766900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2767000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2767100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2767200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2767300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0296, + "step": 2767400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.029, + "step": 2767500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2767600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2767700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2767800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2767900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2768000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2768100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2768200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2768300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2768400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2768500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2768600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2768700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2768800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2768900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2769000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2769100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2769200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 2769300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2769400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2769500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2769600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2769700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2769800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2769900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2770000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2770100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2770200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2770300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2770400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2770500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2770600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2770700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2770800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2770900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2771000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2771100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2771200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2771300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2771400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2771500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2771600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2771700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2771800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2771900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.029, + "step": 2772000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2772100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2772200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2772300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2772400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2772500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2772600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2772700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2772800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2772900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2773000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2773100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2773200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2773300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2773400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2773500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2773600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2773700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2773800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2773900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2774000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2774100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2774200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2774300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2774400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2774500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0296, + "step": 2774600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2774700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2774800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2774900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2775000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2775100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2775200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2775300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2775400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2775500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2775600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2775700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2775800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2775900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2776000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 2776100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2776200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2776300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 2776400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0296, + "step": 2776500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2776600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2776700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2776800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2776900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2777000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2777100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2777200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2777300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2777400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2777500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2777600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2777700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2777800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2777900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2778000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2778100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2778200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2778300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2778400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2778500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2778600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2778700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2778800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2778900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2779000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2779100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2779200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2779300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2779400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2779500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2779600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2779700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2779800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2779900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2780000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0281829833984375, + "eval_runtime": 3210.4835, + "eval_samples_per_second": 350.328, + "eval_steps_per_second": 21.896, + "step": 2780000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2780100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2780200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2780300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2780400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2780500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 2780600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2780700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2780800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2780900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2781000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2781100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2781200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2781300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0294, + "step": 2781400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2781500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2781600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2781700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2781800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2781900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2782000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2782100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0291, + "step": 2782200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2782300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 2782400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2782500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2782600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2782700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2782800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2782900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2783000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2783100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2783200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2783300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2783400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2783500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2783600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2783700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2783800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2783900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 2784000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2784100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0296, + "step": 2784200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2784300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2784400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2784500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2784600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2784700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2784800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2784900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2785000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2785100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2785200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2785300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 2785400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2785500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2785600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 2785700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2785800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2785900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2786000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2786100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2786200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.029, + "step": 2786300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2786400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2786500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2786600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2786700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2786800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2786900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0289, + "step": 2787000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2787100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2787200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2787300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2787400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2787500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2787600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2787700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2787800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2787900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2788000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2788100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2788200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2788300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2788400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2788500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2788600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2788700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2788800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2788900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2789000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2789100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2789200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2789300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2789400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2789500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2789600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2789700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2789800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2789900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2790000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2790100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2790200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 2790300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2790400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2790500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2790600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2790700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 2790800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2790900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 2791000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 2791100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2791200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2791300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2791400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2791500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2791600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2791700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2791800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2791900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2792000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2792100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2792200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2792300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2792400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.029, + "step": 2792500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0292, + "step": 2792600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0292, + "step": 2792700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2792800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2792900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2793000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2793100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2793200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2793300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2793400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2793500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2793600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2793700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0292, + "step": 2793800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2793900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2794000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2794100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2794200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2794300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2794400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2794500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2794600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2794700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2794800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2794900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2795000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2795100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2795200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2795300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2795400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2795500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2795600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2795700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2795800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2795900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2796000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2796100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2796200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2796300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2796400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2796500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2796600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2796700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2796800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2796900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2797000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0291, + "step": 2797100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2797200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2797300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2797400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2797500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2797600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2797700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2797800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2797900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 2798000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2798100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2798200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2798300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2798400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0293, + "step": 2798500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2798600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2798700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 2798800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2798900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2799000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2799100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2799200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2799300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0293, + "step": 2799400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2799500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2799600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2799700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2799800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2799900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0284, + "step": 2800000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0283355712890625, + "eval_runtime": 3172.1198, + "eval_samples_per_second": 354.565, + "eval_steps_per_second": 22.161, + "step": 2800000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2800100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2800200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2800300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2800400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2800500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2800600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2800700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2800800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2800900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2801000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2801100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2801200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2801300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2801400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2801500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2801600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2801700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2801800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2801900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2802000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2802100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0294, + "step": 2802200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2802300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2802400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2802500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2802600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2802700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2802800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2802900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2803000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2803100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2803200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2803300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2803400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2803500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2803600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2803700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2803800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2803900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2804000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2804100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2804200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2804300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2804400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2804500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2804600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2804700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2804800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2804900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2805000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2805100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2805200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2805300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 2805400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2805500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2805600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2805700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2805800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2805900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2806000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2806100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2806200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2806300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2806400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2806500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2806600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2806700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2806800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2806900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 2807000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2807100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2807200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2807300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2807400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2807500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2807600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2807700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2807800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2807900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2808000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0291, + "step": 2808100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2808200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2808300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2808400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2808500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2808600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2808700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2808800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2808900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2809000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2809100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2809200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.029, + "step": 2809300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2809400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2809500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2809600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2809700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2809800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2809900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2810000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2810100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2810200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2810300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2810400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2810500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2810600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 2810700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2810800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 2810900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2811000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 2811100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2811200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2811300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2811400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2811500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2811600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2811700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2811800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2811900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2812000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2812100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2812200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2812300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2812400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2812500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2812600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2812700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2812800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2812900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 2813000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 2813100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0389, + "step": 2813200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2813300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2813400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2813500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2813600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 2813700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2813800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2813900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2814000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2814100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2814200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2814300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2814400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2814500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2814600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2814700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2814800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2814900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2815000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2815100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2815200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2815300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2815400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2815500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2815600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2815700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2815800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2815900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 2816000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2816100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2816200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2816300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2816400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2816500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2816600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2816700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2816800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2816900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2817000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0293, + "step": 2817100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2817200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2817300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2817400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2817500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2817600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2817700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2817800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2817900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2818000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2818100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2818200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0351, + "step": 2818300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2818400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2818500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2818600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2818700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2818800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2818900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2819000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 2819100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2819200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2819300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2819400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2819500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2819600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2819700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2819800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2819900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2820000 + }, + { + "epoch": 0.0, + "eval_loss": 0.028533935546875, + "eval_runtime": 3062.156, + "eval_samples_per_second": 367.298, + "eval_steps_per_second": 22.956, + "step": 2820000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2820100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2820200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2820300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2820400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2820500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2820600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2820700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2820800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2820900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2821000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2821100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2821200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2821300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0287, + "step": 2821400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2821500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2821600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2821700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2821800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2821900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2822000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 2822100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2822200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2822300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2822400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2822500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2822600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2822700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2822800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2822900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2823000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2823100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2823200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2823300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2823400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2823500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2823600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2823700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2823800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2823900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 2824000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2824100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2824200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2824300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.029, + "step": 2824400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2824500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2824600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2824700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2824800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2824900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2825000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2825100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2825200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2825300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2825400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2825500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2825600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2825700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2825800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2825900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2826000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2826100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2826200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2826300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2826400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2826500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2826600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2826700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2826800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 2826900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2827000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0294, + "step": 2827100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2827200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2827300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2827400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2827500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2827600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2827700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2827800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2827900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2828000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2828100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2828200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2828300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2828400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 2828500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2828600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2828700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2828800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2828900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2829000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2829100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2829200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2829300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2829400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2829500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2829600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2829700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2829800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2829900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2830000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2830100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2830200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2830300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2830400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2830500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2830600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2830700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 2830800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2830900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2831000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2831100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2831200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2831300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2831400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2831500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2831600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2831700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2831800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2831900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2832000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2832100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2832200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2832300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2832400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2832500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2832600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2832700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2832800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2832900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2833000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2833100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2833200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2833300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2833400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2833500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2833600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2833700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2833800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2833900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2834000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2834100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2834200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2834300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2834400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2834500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2834600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0292, + "step": 2834700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2834800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2834900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2835000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 2835100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2835200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2835300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 2835400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0294, + "step": 2835500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0294, + "step": 2835600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2835700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2835800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2835900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2836000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2836100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2836200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2836300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2836400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2836500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 2836600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2836700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2836800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2836900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2837000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2837100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2837200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2837300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2837400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2837500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2837600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2837700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2837800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2837900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2838000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2838100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2838200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2838300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2838400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2838500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2838600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2838700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2838800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2838900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2839000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2839100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2839200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2839300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2839400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2839500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2839600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2839700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2839800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2839900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2840000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0284576416015625, + "eval_runtime": 3026.4651, + "eval_samples_per_second": 371.629, + "eval_steps_per_second": 23.227, + "step": 2840000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2840100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2840200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0296, + "step": 2840300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 2840400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 2840500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 2840600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2840700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2840800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2840900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2841000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2841100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2841200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2841300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2841400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2841500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2841600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 2841700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2841800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2841900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2842000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2842100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2842200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2842300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2842400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2842500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2842600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2842700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2842800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2842900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2843000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2843100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2843200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2843300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0296, + "step": 2843400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2843500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2843600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2843700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2843800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2843900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2844000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2844100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2844200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2844300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2844400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2844500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2844600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2844700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2844800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2844900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2845000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2845100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2845200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2845300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2845400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2845500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2845600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2845700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2845800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 2845900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 2846000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2846100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0292, + "step": 2846200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2846300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 2846400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2846500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2846600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2846700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2846800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2846900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2847000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2847100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2847200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2847300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2847400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2847500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2847600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2847700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2847800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2847900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2848000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2848100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2848200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0586, + "step": 2848300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 2848400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2848500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2848600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2848700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2848800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2848900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2849000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2849100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2849200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2849300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2849400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2849500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2849600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2849700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2849800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2849900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2850000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2850100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2850200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2850300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2850400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2850500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2850600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0289, + "step": 2850700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2850800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2850900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2851000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2851100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2851200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2851300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2851400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2851500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2851600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2851700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2851800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2851900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2852000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 2852100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2852200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2852300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2852400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2852500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2852600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2852700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2852800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 2852900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2853000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2853100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0293, + "step": 2853200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2853300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2853400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2853500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2853600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2853700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 2853800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2853900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2854000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2854100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 2854200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2854300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2854400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2854500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2854600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2854700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2854800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2854900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2855000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2855100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2855200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2855300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0289, + "step": 2855400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2855500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2855600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2855700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2855800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2855900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2856000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 2856100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2856200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2856300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 2856400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2856500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2856600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2856700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2856800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2856900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2857000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2857100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2857200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2857300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2857400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 2857500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2857600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2857700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2857800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2857900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2858000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2858100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0294, + "step": 2858200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2858300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2858400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2858500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0296, + "step": 2858600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2858700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2858800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2858900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2859000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2859100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2859200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0294, + "step": 2859300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2859400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2859500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2859600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2859700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2859800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2859900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2860000 + }, + { + "epoch": 0.0, + "eval_loss": 0.028289794921875, + "eval_runtime": 2980.4196, + "eval_samples_per_second": 377.371, + "eval_steps_per_second": 23.586, + "step": 2860000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2860100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2860200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2860300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2860400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2860500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2860600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2860700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2860800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2860900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2861000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2861100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2861200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2861300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 2861400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2861500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2861600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2861700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2861800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2861900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2862000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2862100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2862200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2862300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 2862400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2862500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2862600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 2862700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2862800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 2862900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2863000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2863100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2863200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2863300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2863400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2863500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2863600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2863700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2863800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2863900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2864000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2864100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2864200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2864300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2864400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2864500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2864600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2864700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2864800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 2864900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2865000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2865100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 2865200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2865300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 2865400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2865500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 2865600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2865700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2865800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2865900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2866000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2866100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2866200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0457, + "step": 2866300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2866400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2866500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2866600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2866700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2866800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2866900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2867000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2867100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2867200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2867300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2867400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2867500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2867600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 2867700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2867800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2867900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2868000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2868100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 2868200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2868300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2868400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2868500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2868600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 2868700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2868800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.037, + "step": 2868900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2869000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2869100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2869200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2869300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2869400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2869500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 2869600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2869700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 2869800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2869900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2870000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2870100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2870200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2870300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0296, + "step": 2870400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2870500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2870600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2870700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2870800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0361, + "step": 2870900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2871000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2871100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2871200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2871300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2871400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2871500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2871600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2871700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2871800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2871900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2872000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2872100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 2872200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2872300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2872400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.029, + "step": 2872500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2872600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2872700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2872800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0294, + "step": 2872900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0292, + "step": 2873000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2873100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2873200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2873300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2873400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2873500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2873600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2873700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2873800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2873900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2874000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2874100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2874200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2874300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2874400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2874500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2874600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2874700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2874800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2874900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2875000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2875100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2875200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2875300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2875400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2875500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2875600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2875700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2875800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2875900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2876000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 2876100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 2876200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2876300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2876400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2876500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2876600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2876700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2876800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2876900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 2877000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2877100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2877200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2877300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2877400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 2877500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2877600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2877700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2877800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2877900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2878000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2878100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2878200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2878300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2878400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2878500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2878600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2878700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2878800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2878900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2879000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2879100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2879200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2879300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2879400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2879500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0294, + "step": 2879600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2879700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0292, + "step": 2879800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2879900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2880000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0283660888671875, + "eval_runtime": 2966.4786, + "eval_samples_per_second": 379.144, + "eval_steps_per_second": 23.697, + "step": 2880000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2880100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2880200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2880300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2880400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 2880500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2880600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2880700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2880800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2880900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2881000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0292, + "step": 2881100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2881200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2881300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 2881400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2881500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2881600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2881700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2881800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2881900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2882000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2882100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2882200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2882300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2882400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2882500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2882600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2882700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2882800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2882900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2883000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2883100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2883200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2883300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2883400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2883500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2883600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2883700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2883800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2883900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2884000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 2884100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2884200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2884300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2884400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2884500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 2884600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2884700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2884800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2884900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2885000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2885100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2885200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2885300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2885400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2885500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2885600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2885700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2885800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2885900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2886000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2886100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2886200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0294, + "step": 2886300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2886400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2886500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2886600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2886700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2886800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2886900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2887000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2887100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2887200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2887300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2887400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2887500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2887600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2887700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2887800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2887900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2888000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2888100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2888200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2888300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2888400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2888500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 2888600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2888700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2888800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2888900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2889000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2889100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2889200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2889300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2889400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2889500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2889600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2889700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2889800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2889900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0291, + "step": 2890000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2890100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2890200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2890300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2890400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2890500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2890600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2890700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2890800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2890900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2891000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2891100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2891200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2891300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2891400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2891500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2891600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2891700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2891800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2891900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2892000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2892100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2892200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2892300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2892400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2892500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0371, + "step": 2892600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2892700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2892800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2892900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2893000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2893100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2893200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0296, + "step": 2893300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2893400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2893500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2893600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2893700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2893800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2893900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2894000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2894100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2894200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2894300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2894400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0294, + "step": 2894500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2894600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2894700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2894800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2894900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2895000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2895100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2895200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2895300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 2895400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2895500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2895600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2895700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0293, + "step": 2895800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2895900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2896000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2896100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2896200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2896300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2896400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2896500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2896600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2896700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2896800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2896900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2897000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2897100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2897200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0293, + "step": 2897300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2897400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2897500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2897600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2897700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2897800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2897900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0294, + "step": 2898000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2898100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2898200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2898300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2898400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2898500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2898600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2898700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2898800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2898900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2899000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2899100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2899200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2899300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2899400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2899500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2899600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2899700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2899800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2899900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2900000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0283355712890625, + "eval_runtime": 2977.4801, + "eval_samples_per_second": 377.743, + "eval_steps_per_second": 23.609, + "step": 2900000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2900100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2900200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2900300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2900400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2900500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2900600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2900700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2900800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2900900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2901000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0294, + "step": 2901100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2901200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2901300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 2901400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2901500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2901600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2901700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2901800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2901900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2902000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2902100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0292, + "step": 2902200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 2902300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2902400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0291, + "step": 2902500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2902600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0291, + "step": 2902700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2902800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2902900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2903000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2903100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2903200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2903300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2903400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2903500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2903600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2903700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2903800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0292, + "step": 2903900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2904000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2904100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2904200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2904300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2904400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2904500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2904600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2904700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2904800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2904900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2905000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2905100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2905200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 2905300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2905400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2905500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0296, + "step": 2905600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2905700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0289, + "step": 2905800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2905900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2906000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2906100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2906200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2906300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2906400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2906500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2906600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2906700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2906800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2906900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2907000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2907100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2907200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2907300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2907400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2907500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2907600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2907700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2907800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2907900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2908000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2908100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2908200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2908300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.029, + "step": 2908400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2908500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2908600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2908700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2908800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2908900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2909000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2909100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2909200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2909300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2909400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2909500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2909600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2909700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2909800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2909900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2910000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2910100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2910200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2910300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0294, + "step": 2910400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2910500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2910600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2910700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2910800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2910900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2911000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2911100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2911200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2911300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2911400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2911500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0296, + "step": 2911600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2911700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2911800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0294, + "step": 2911900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2912000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2912100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2912200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2912300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2912400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2912500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2912600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2912700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2912800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2912900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0383, + "step": 2913000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2913100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2913200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 2913300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 2913400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2913500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2913600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2913700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2913800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2913900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2914000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2914100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0502, + "step": 2914200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2914300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2914400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2914500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2914600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2914700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2914800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2914900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2915000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2915100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2915200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2915300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 2915400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2915500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2915600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2915700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2915800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2915900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2916000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2916100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 2916200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2916300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2916400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2916500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2916600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 2916700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2916800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2916900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2917000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2917100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2917200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2917300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2917400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2917500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2917600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2917700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2917800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2917900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2918000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2918100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2918200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2918300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2918400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2918500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2918600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 2918700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2918800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2918900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2919000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2919100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2919200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2919300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2919400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2919500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2919600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2919700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2919800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2919900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2920000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0279388427734375, + "eval_runtime": 2996.4079, + "eval_samples_per_second": 375.357, + "eval_steps_per_second": 23.46, + "step": 2920000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2920100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2920200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0292, + "step": 2920300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2920400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2920500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 2920600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2920700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2920800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2920900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2921000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 2921100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2921200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2921300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2921400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 2921500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2921600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 2921700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2921800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0291, + "step": 2921900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2922000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2922100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2922200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2922300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2922400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2922500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2922600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2922700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2922800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2922900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0292, + "step": 2923000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2923100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2923200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2923300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2923400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2923500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2923600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2923700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2923800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2923900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2924000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2924100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2924200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2924300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 2924400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 2924500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2924600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2924700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2924800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2924900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2925000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2925100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2925200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2925300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2925400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2925500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2925600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2925700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2925800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2925900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2926000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2926100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2926200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2926300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2926400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2926500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2926600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2926700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2926800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2926900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2927000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2927100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2927200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2927300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2927400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2927500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2927600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2927700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2927800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2927900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2928000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2928100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2928200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2928300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2928400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2928500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2928600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2928700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2928800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2928900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2929000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2929100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2929200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2929300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2929400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2929500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2929600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2929700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2929800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2929900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2930000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2930100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2930200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2930300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2930400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2930500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2930600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2930700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2930800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2930900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2931000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2931100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2931200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2931300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2931400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2931500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2931600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2931700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2931800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2931900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2932000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2932100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2932200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2932300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2932400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2932500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2932600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2932700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2932800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2932900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2933000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2933100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2933200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2933300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2933400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2933500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2933600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2933700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2933800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 2933900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2934000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2934100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2934200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2934300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2934400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2934500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2934600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0292, + "step": 2934700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 2934800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2934900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2935000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2935100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2935200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2935300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2935400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2935500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2935600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2935700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2935800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2935900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2936000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2936100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2936200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2936300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2936400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2936500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2936600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0289, + "step": 2936700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2936800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 2936900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2937000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 2937100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2937200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2937300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2937400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2937500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2937600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2937700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2937800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2937900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2938000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2938100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2938200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2938300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2938400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0294, + "step": 2938500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2938600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2938700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2938800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2938900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2939000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2939100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2939200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2939300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 2939400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2939500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2939600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0293, + "step": 2939700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2939800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2939900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2940000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0283050537109375, + "eval_runtime": 3008.1111, + "eval_samples_per_second": 373.897, + "eval_steps_per_second": 23.369, + "step": 2940000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0445, + "step": 2940100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2940200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2940300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2940400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2940500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.029, + "step": 2940600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2940700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0296, + "step": 2940800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2940900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0291, + "step": 2941000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2941100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2941200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2941300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2941400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 2941500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2941600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2941700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2941800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2941900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2942000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2942100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2942200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0294, + "step": 2942300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2942400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2942500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2942600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2942700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2942800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2942900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2943000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2943100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2943200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2943300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2943400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2943500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2943600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2943700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.029, + "step": 2943800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2943900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2944000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2944100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2944200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2944300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2944400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2944500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2944600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2944700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2944800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2944900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 2945000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2945100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2945200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2945300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2945400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2945500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2945600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2945700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2945800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2945900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2946000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2946100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 2946200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2946300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2946400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2946500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2946600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2946700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2946800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2946900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2947000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 2947100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2947200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0296, + "step": 2947300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2947400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2947500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2947600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2947700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2947800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2947900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2948000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2948100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2948200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2948300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2948400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2948500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2948600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2948700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2948800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 2948900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2949000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 2949100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0294, + "step": 2949200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2949300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2949400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2949500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2949600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2949700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2949800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2949900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2950000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2950100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2950200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2950300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 2950400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2950500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2950600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0291, + "step": 2950700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2950800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2950900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2951000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2951100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2951200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2951300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 2951400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2951500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2951600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 2951700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2951800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2951900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2952000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2952100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2952200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2952300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2952400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2952500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2952600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2952700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2952800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2952900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2953000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0296, + "step": 2953100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2953200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2953300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2953400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2953500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2953600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2953700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2953800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2953900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2954000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2954100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2954200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2954300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2954400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2954500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0291, + "step": 2954600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2954700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2954800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2954900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2955000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2955100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2955200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2955300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2955400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2955500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2955600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2955700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2955800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2955900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2956000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2956100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2956200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2956300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2956400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 2956500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2956600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2956700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2956800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0294, + "step": 2956900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2957000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2957100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2957200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2957300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2957400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2957500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0288, + "step": 2957600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2957700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2957800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2957900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2958000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2958100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2958200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2958300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2958400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2958500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2958600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2958700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2958800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2958900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2959000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0294, + "step": 2959100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2959200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2959300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2959400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2959500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2959600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2959700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2959800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 2959900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2960000 + }, + { + "epoch": 0.0, + "eval_loss": 0.02825927734375, + "eval_runtime": 3374.7233, + "eval_samples_per_second": 333.279, + "eval_steps_per_second": 20.83, + "step": 2960000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 2960100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2960200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2960300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2960400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2960500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2960600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2960700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2960800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2960900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2961000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2961100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2961200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2961300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2961400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2961500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2961600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 2961700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0296, + "step": 2961800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2961900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2962000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2962100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2962200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0292, + "step": 2962300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2962400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2962500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2962600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2962700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 2962800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2962900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2963000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2963100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2963200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2963300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2963400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2963500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2963600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2963700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2963800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2963900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2964000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2964100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2964200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2964300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2964400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2964500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2964600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2964700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2964800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2964900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2965000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2965100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2965200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2965300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2965400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2965500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2965600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0292, + "step": 2965700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2965800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2965900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2966000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2966100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2966200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2966300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2966400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2966500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2966600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2966700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2966800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2966900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2967000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2967100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2967200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2967300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2967400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0292, + "step": 2967500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2967600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2967700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2967800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2967900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2968000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2968100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2968200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2968300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2968400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2968500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2968600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2968700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2968800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2968900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2969000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2969100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2969200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2969300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2969400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2969500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2969600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2969700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2969800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2969900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2970000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2970100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2970200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2970300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2970400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2970500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2970600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0296, + "step": 2970700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2970800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2970900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2971000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2971100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0344, + "step": 2971200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2971300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2971400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2971500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 2971600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2971700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0503, + "step": 2971800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0439, + "step": 2971900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 2972000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 2972100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2972200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0404, + "step": 2972300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.039, + "step": 2972400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0411, + "step": 2972500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 2972600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2972700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2972800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 2972900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2973000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2973100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2973200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2973300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2973400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2973500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 2973600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2973700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 2973800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2973900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2974000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 2974100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2974200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2974300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 2974400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 2974500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2974600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 2974700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2974800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 2974900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2975000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2975100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2975200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2975300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2975400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2975500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2975600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2975700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2975800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2975900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 2976000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 2976100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2976200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2976300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2976400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2976500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2976600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2976700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2976800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2976900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2977000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2977100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2977200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2977300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2977400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2977500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2977600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2977700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2977800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2977900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2978000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2978100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2978200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2978300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2978400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2978500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2978600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2978700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2978800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2978900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2979000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0296, + "step": 2979100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2979200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2979300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2979400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2979500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2979600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2979700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2979800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2979900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2980000 + }, + { + "epoch": 0.0, + "eval_loss": 0.02874755859375, + "eval_runtime": 3357.7861, + "eval_samples_per_second": 334.96, + "eval_steps_per_second": 20.935, + "step": 2980000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2980100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2980200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 2980300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2980400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2980500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2980600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0291, + "step": 2980700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0296, + "step": 2980800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2980900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2981000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2981100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2981200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2981300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2981400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2981500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2981600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2981700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2981800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2981900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2982000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2982100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0287, + "step": 2982200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2982300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2982400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0288, + "step": 2982500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0296, + "step": 2982600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2982700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2982800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2982900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0291, + "step": 2983000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2983100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2983200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2983300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2983400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 2983500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2983600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2983700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2983800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2983900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2984000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 2984100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2984200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2984300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2984400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2984500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2984600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2984700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2984800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2984900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2985000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2985100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2985200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2985300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2985400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2985500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 2985600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2985700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2985800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2985900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2986000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2986100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2986200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2986300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2986400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0291, + "step": 2986500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2986600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2986700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2986800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2986900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2987000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2987100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 2987200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2987300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2987400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 2987500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2987600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2987700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2987800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2987900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2988000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2988100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2988200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2988300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2988400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2988500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2988600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2988700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2988800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2988900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2989000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2989100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 2989200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2989300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2989400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 2989500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2989600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2989700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2989800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2989900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2990000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2990100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2990200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2990300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 2990400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2990500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2990600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2990700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2990800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2990900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2991000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0292, + "step": 2991100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2991200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2991300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2991400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2991500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 2991600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2991700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0294, + "step": 2991800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2991900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2992000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 2992100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2992200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2992300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2992400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2992500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2992600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2992700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0287, + "step": 2992800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2992900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2993000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2993100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2993200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 2993300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2993400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 2993500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2993600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2993700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0372, + "step": 2993800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2993900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 2994000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2994100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 2994200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2994300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2994400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2994500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2994600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 2994700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 2994800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2994900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 2995000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2995100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2995200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 2995300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2995400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2995500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2995600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2995700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2995800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2995900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2996000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2996100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2996200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2996300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0293, + "step": 2996400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 2996500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 2996600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2996700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 2996800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 2996900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 2997000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 2997100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2997200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2997300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2997400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2997500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2997600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 2997700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2997800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2997900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 2998000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2998100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2998200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2998300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 2998400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2998500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 2998600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 2998700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 2998800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 2998900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 2999000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 2999100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 2999200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 2999300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 2999400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 2999500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 2999600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 2999700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 2999800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 2999900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 3000000 + }, + { + "epoch": 0.0, + "eval_loss": 0.027923583984375, + "eval_runtime": 3313.1307, + "eval_samples_per_second": 339.474, + "eval_steps_per_second": 21.217, + "step": 3000000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 3000100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 3000200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 3000300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 3000400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 3000500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 3000600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 3000700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 3000800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 3000900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 3001000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 3001100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 3001200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 3001300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 3001400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 3001500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 3001600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 3001700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 3001800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 3001900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 3002000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 3002100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0384, + "step": 3002200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0409, + "step": 3002300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 3002400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 3002500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 3002600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 3002700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 3002800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 3002900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 3003000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 3003100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0283, + "step": 3003200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 3003300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 3003400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 3003500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 3003600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0293, + "step": 3003700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 3003800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 3003900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 3004000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 3004100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 3004200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 3004300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 3004400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 3004500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 3004600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 3004700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 3004800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 3004900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 3005000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 3005100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 3005200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 3005300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 3005400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 3005500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 3005600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0296, + "step": 3005700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 3005800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 3005900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 3006000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 3006100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 3006200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 3006300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 3006400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 3006500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 3006600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 3006700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 3006800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 3006900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 3007000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 3007100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 3007200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 3007300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 3007400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 3007500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 3007600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 3007700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 3007800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 3007900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 3008000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 3008100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 3008200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0292, + "step": 3008300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 3008400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 3008500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 3008600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 3008700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 3008800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 3008900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 3009000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 3009100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 3009200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 3009300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 3009400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 3009500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 3009600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 3009700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 3009800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0293, + "step": 3009900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 3010000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 3010100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 3010200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 3010300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 3010400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 3010500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 3010600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 3010700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 3010800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0289, + "step": 3010900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 3011000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 3011100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 3011200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 3011300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0296, + "step": 3011400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 3011500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 3011600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 3011700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 3011800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 3011900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 3012000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 3012100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 3012200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 3012300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 3012400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 3012500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 3012600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 3012700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 3012800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 3012900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 3013000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 3013100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0294, + "step": 3013200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 3013300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 3013400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 3013500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 3013600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 3013700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 3013800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 3013900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 3014000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 3014100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 3014200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 3014300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 3014400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 3014500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 3014600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 3014700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 3014800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 3014900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 3015000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 3015100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 3015200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 3015300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 3015400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 3015500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 3015600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 3015700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 3015800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 3015900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 3016000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 3016100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 3016200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 3016300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 3016400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 3016500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 3016600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 3016700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.029, + "step": 3016800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 3016900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 3017000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 3017100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 3017200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 3017300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 3017400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 3017500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 3017600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 3017700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 3017800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 3017900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 3018000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0286, + "step": 3018100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 3018200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 3018300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 3018400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 3018500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 3018600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 3018700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 3018800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 3018900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 3019000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 3019100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 3019200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 3019300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 3019400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 3019500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 3019600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 3019700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 3019800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 3019900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 3020000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0284576416015625, + "eval_runtime": 3375.5001, + "eval_samples_per_second": 333.202, + "eval_steps_per_second": 20.825, + "step": 3020000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 3020100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 3020200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 3020300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 3020400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 3020500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 3020600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 3020700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0288, + "step": 3020800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 3020900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 3021000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 3021100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 3021200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 3021300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 3021400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 3021500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 3021600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 3021700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 3021800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 3021900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 3022000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 3022100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 3022200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 3022300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 3022400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 3022500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 3022600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 3022700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0296, + "step": 3022800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 3022900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 3023000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 3023100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 3023200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 3023300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0291, + "step": 3023400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 3023500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 3023600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 3023700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 3023800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 3023900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 3024000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 3024100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 3024200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 3024300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 3024400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 3024500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 3024600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 3024700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 3024800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 3024900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 3025000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 3025100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 3025200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 3025300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0291, + "step": 3025400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0293, + "step": 3025500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 3025600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 3025700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 3025800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 3025900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 3026000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 3026100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 3026200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 3026300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 3026400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 3026500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 3026600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 3026700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 3026800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 3026900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0382, + "step": 3027000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 3027100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 3027200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 3027300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 3027400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 3027500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 3027600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 3027700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 3027800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 3027900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 3028000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 3028100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 3028200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 3028300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 3028400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 3028500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 3028600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 3028700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 3028800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 3028900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 3029000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 3029100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 3029200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 3029300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 3029400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 3029500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 3029600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 3029700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0359, + "step": 3029800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 3029900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 3030000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0357, + "step": 3030100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 3030200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 3030300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 3030400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 3030500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 3030600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 3030700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 3030800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 3030900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 3031000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 3031100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 3031200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 3031300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 3031400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 3031500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 3031600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 3031700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 3031800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 3031900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 3032000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 3032100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 3032200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 3032300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 3032400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 3032500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 3032600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 3032700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 3032800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 3032900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 3033000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 3033100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 3033200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 3033300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 3033400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 3033500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 3033600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 3033700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 3033800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 3033900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 3034000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 3034100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 3034200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0293, + "step": 3034300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 3034400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 3034500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 3034600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 3034700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 3034800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 3034900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 3035000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 3035100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 3035200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 3035300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 3035400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 3035500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 3035600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 3035700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 3035800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 3035900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 3036000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 3036100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 3036200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 3036300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 3036400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 3036500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 3036600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 3036700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 3036800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 3036900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 3037000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 3037100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 3037200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 3037300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 3037400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 3037500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 3037600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 3037700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 3037800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 3037900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 3038000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 3038100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 3038200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 3038300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 3038400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 3038500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 3038600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 3038700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 3038800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 3038900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 3039000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 3039100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 3039200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 3039300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 3039400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 3039500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 3039600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 3039700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 3039800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 3039900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 3040000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0287017822265625, + "eval_runtime": 3318.2668, + "eval_samples_per_second": 338.949, + "eval_steps_per_second": 21.185, + "step": 3040000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 3040100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 3040200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 3040300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 3040400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 3040500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 3040600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 3040700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 3040800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 3040900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 3041000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 3041100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 3041200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 3041300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 3041400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 3041500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 3041600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 3041700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 3041800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 3041900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 3042000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 3042100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 3042200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 3042300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 3042400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 3042500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 3042600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0292, + "step": 3042700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 3042800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 3042900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 3043000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 3043100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 3043200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 3043300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 3043400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 3043500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 3043600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 3043700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 3043800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 3043900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 3044000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 3044100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 3044200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 3044300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 3044400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 3044500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 3044600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 3044700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 3044800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 3044900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 3045000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 3045100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 3045200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 3045300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 3045400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0296, + "step": 3045500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 3045600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0293, + "step": 3045700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 3045800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 3045900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 3046000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 3046100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 3046200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 3046300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 3046400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 3046500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 3046600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 3046700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 3046800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 3046900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 3047000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 3047100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 3047200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 3047300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 3047400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 3047500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 3047600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 3047700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 3047800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0296, + "step": 3047900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 3048000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 3048100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 3048200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 3048300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 3048400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 3048500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 3048600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 3048700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 3048800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 3048900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 3049000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 3049100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 3049200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 3049300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 3049400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 3049500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 3049600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 3049700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 3049800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0411, + "step": 3049900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 3050000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 3050100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 3050200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 3050300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 3050400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 3050500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 3050600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 3050700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 3050800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 3050900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 3051000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 3051100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 3051200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 3051300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 3051400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 3051500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 3051600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0296, + "step": 3051700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 3051800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 3051900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0291, + "step": 3052000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 3052100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 3052200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 3052300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 3052400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 3052500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 3052600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 3052700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 3052800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 3052900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 3053000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 3053100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 3053200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 3053300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 3053400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 3053500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 3053600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 3053700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 3053800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 3053900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 3054000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 3054100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 3054200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 3054300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 3054400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 3054500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 3054600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 3054700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 3054800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 3054900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 3055000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 3055100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 3055200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 3055300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 3055400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 3055500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 3055600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 3055700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 3055800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 3055900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 3056000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 3056100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 3056200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 3056300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 3056400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 3056500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 3056600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 3056700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 3056800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 3056900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 3057000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 3057100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 3057200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 3057300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 3057400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 3057500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 3057600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 3057700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 3057800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 3057900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 3058000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 3058100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 3058200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 3058300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 3058400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 3058500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 3058600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 3058700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 3058800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 3058900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 3059000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 3059100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 3059200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 3059300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 3059400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 3059500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 3059600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 3059700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 3059800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 3059900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 3060000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0281982421875, + "eval_runtime": 3454.829, + "eval_samples_per_second": 325.551, + "eval_steps_per_second": 20.347, + "step": 3060000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 3060100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0414, + "step": 3060200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 3060300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 3060400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 3060500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0296, + "step": 3060600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 3060700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 3060800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 3060900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 3061000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 3061100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 3061200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 3061300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 3061400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 3061500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 3061600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0284, + "step": 3061700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 3061800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 3061900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 3062000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 3062100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 3062200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 3062300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 3062400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 3062500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 3062600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 3062700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 3062800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 3062900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 3063000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 3063100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 3063200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 3063300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 3063400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 3063500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 3063600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 3063700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 3063800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 3063900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 3064000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 3064100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 3064200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 3064300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 3064400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 3064500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 3064600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 3064700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0285, + "step": 3064800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 3064900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 3065000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 3065100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 3065200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 3065300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 3065400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 3065500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 3065600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 3065700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 3065800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 3065900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 3066000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 3066100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 3066200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 3066300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 3066400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 3066500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 3066600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 3066700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0293, + "step": 3066800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 3066900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 3067000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 3067100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 3067200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 3067300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 3067400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 3067500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1114, + "step": 3067600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 3067700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 3067800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 3067900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 3068000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 3068100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 3068200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 3068300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 3068400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 3068500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 3068600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 3068700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 3068800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 3068900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 3069000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 3069100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 3069200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 3069300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 3069400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0296, + "step": 3069500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 3069600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 3069700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0288, + "step": 3069800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 3069900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 3070000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 3070100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 3070200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 3070300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 3070400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 3070500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 3070600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 3070700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0291, + "step": 3070800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 3070900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 3071000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 3071100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 3071200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 3071300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 3071400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 3071500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 3071600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 3071700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 3071800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 3071900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 3072000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 3072100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 3072200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 3072300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 3072400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 3072500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 3072600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 3072700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 3072800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 3072900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 3073000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 3073100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 3073200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 3073300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 3073400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 3073500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 3073600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 3073700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0341, + "step": 3073800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 3073900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0349, + "step": 3074000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 3074100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 3074200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 3074300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 3074400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 3074500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 3074600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 3074700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 3074800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0294, + "step": 3074900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 3075000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 3075100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0293, + "step": 3075200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0296, + "step": 3075300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 3075400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 3075500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 3075600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 3075700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 3075800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 3075900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 3076000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 3076100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 3076200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 3076300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 3076400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 3076500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 3076600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 3076700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 3076800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 3076900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 3077000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0376, + "step": 3077100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 3077200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 3077300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0459, + "step": 3077400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 3077500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 3077600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 3077700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 3077800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0423, + "step": 3077900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0366, + "step": 3078000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 3078100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 3078200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 3078300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 3078400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0294, + "step": 3078500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 3078600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 3078700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 3078800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 3078900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 3079000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 3079100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0292, + "step": 3079200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 3079300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 3079400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 3079500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 3079600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 3079700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 3079800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 3079900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0375, + "step": 3080000 + }, + { + "epoch": 0.0, + "eval_loss": 0.03399658203125, + "eval_runtime": 3368.1465, + "eval_samples_per_second": 333.929, + "eval_steps_per_second": 20.871, + "step": 3080000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 3080100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 3080200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 3080300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 3080400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 3080500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 3080600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 3080700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 3080800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 3080900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 3081000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 3081100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 3081200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 3081300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 3081400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 3081500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 3081600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 3081700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 3081800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 3081900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 3082000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0282, + "step": 3082100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 3082200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 3082300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 3082400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 3082500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 3082600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 3082700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 3082800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 3082900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 3083000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 3083100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 3083200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 3083300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 3083400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 3083500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 3083600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 3083700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 3083800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 3083900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 3084000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 3084100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 3084200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 3084300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 3084400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 3084500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 3084600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 3084700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 3084800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 3084900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 3085000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 3085100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 3085200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 3085300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 3085400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 3085500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 3085600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 3085700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 3085800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 3085900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 3086000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 3086100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 3086200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 3086300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 3086400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 3086500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 3086600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 3086700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 3086800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.029, + "step": 3086900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 3087000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 3087100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 3087200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 3087300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 3087400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 3087500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 3087600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 3087700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 3087800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 3087900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0293, + "step": 3088000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 3088100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 3088200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 3088300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 3088400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 3088500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 3088600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 3088700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 3088800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 3088900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 3089000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 3089100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0343, + "step": 3089200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 3089300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 3089400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 3089500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 3089600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 3089700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 3089800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 3089900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 3090000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 3090100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 3090200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 3090300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 3090400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 3090500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 3090600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 3090700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 3090800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 3090900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 3091000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 3091100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 3091200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 3091300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 3091400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 3091500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 3091600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 3091700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 3091800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 3091900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 3092000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 3092100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 3092200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 3092300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 3092400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 3092500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 3092600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 3092700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 3092800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 3092900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 3093000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 3093100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 3093200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 3093300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 3093400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 3093500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 3093600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 3093700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 3093800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 3093900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0292, + "step": 3094000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 3094100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 3094200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 3094300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 3094400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0284, + "step": 3094500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0294, + "step": 3094600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0283, + "step": 3094700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 3094800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 3094900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 3095000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.029, + "step": 3095100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 3095200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 3095300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 3095400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0293, + "step": 3095500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 3095600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 3095700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 3095800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 3095900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 3096000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0296, + "step": 3096100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 3096200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 3096300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 3096400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 3096500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 3096600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 3096700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 3096800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 3096900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 3097000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 3097100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 3097200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 3097300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0293, + "step": 3097400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 3097500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 3097600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 3097700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 3097800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 3097900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 3098000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 3098100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 3098200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 3098300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 3098400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 3098500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 3098600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 3098700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 3098800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 3098900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.034, + "step": 3099000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 3099100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 3099200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 3099300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 3099400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 3099500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 3099600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 3099700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 3099800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 3099900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 3100000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0283203125, + "eval_runtime": 3298.1173, + "eval_samples_per_second": 341.02, + "eval_steps_per_second": 21.314, + "step": 3100000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 3100100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 3100200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 3100300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 3100400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 3100500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 3100600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 3100700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 3100800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 3100900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 3101000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 3101100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 3101200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0291, + "step": 3101300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 3101400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 3101500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 3101600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 3101700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 3101800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 3101900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 3102000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 3102100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 3102200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 3102300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 3102400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0284, + "step": 3102500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 3102600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 3102700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 3102800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 3102900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 3103000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 3103100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 3103200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 3103300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0288, + "step": 3103400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 3103500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 3103600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.1013, + "step": 3103700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0402, + "step": 3103800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 3103900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 3104000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 3104100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 3104200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 3104300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 3104400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 3104500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 3104600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 3104700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0335, + "step": 3104800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 3104900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 3105000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 3105100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 3105200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 3105300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 3105400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 3105500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 3105600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 3105700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 3105800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 3105900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 3106000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 3106100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 3106200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.029, + "step": 3106300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 3106400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 3106500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 3106600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 3106700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 3106800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 3106900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 3107000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 3107100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 3107200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 3107300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 3107400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 3107500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 3107600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 3107700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 3107800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 3107900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 3108000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 3108100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 3108200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 3108300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 3108400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 3108500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 3108600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 3108700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 3108800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 3108900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 3109000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 3109100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0294, + "step": 3109200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 3109300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0296, + "step": 3109400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 3109500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 3109600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 3109700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 3109800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0476, + "step": 3109900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.04, + "step": 3110000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.035, + "step": 3110100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 3110200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 3110300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 3110400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 3110500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 3110600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 3110700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 3110800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 3110900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 3111000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 3111100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 3111200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 3111300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 3111400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 3111500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0353, + "step": 3111600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 3111700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 3111800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 3111900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 3112000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 3112100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 3112200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 3112300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 3112400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 3112500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 3112600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 3112700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 3112800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 3112900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 3113000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 3113100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 3113200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 3113300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 3113400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 3113500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 3113600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 3113700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 3113800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 3113900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0292, + "step": 3114000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 3114100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 3114200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 3114300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0347, + "step": 3114400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 3114500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 3114600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0293, + "step": 3114700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 3114800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 3114900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 3115000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 3115100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0345, + "step": 3115200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 3115300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 3115400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 3115500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 3115600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 3115700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0342, + "step": 3115800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 3115900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 3116000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 3116100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 3116200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 3116300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 3116400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 3116500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 3116600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 3116700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 3116800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 3116900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 3117000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 3117100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 3117200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 3117300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 3117400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 3117500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 3117600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0296, + "step": 3117700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 3117800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 3117900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 3118000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 3118100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 3118200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 3118300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 3118400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 3118500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0294, + "step": 3118600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 3118700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 3118800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 3118900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 3119000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0387, + "step": 3119100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 3119200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 3119300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0379, + "step": 3119400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 3119500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0348, + "step": 3119600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 3119700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 3119800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 3119900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 3120000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0281982421875, + "eval_runtime": 3281.7064, + "eval_samples_per_second": 342.725, + "eval_steps_per_second": 21.421, + "step": 3120000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 3120100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 3120200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 3120300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 3120400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 3120500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 3120600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 3120700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 3120800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 3120900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 3121000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 3121100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 3121200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 3121300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 3121400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 3121500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 3121600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 3121700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 3121800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 3121900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 3122000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 3122100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 3122200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 3122300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 3122400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 3122500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0293, + "step": 3122600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0294, + "step": 3122700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 3122800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 3122900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 3123000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 3123100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 3123200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 3123300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 3123400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 3123500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 3123600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 3123700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 3123800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 3123900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 3124000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 3124100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 3124200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 3124300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 3124400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 3124500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 3124600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 3124700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0284, + "step": 3124800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 3124900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 3125000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 3125100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 3125200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 3125300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 3125400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 3125500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0296, + "step": 3125600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 3125700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 3125800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 3125900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 3126000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 3126100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 3126200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 3126300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 3126400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 3126500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 3126600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 3126700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 3126800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 3126900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 3127000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 3127100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 3127200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 3127300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0294, + "step": 3127400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 3127500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 3127600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 3127700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 3127800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 3127900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 3128000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 3128100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 3128200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 3128300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 3128400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 3128500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 3128600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 3128700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 3128800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 3128900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 3129000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 3129100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 3129200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 3129300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 3129400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 3129500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 3129600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 3129700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0294, + "step": 3129800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 3129900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0287, + "step": 3130000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 3130100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 3130200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 3130300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 3130400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 3130500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 3130600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 3130700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 3130800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 3130900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 3131000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 3131100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 3131200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 3131300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 3131400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 3131500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 3131600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 3131700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 3131800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 3131900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 3132000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 3132100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 3132200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 3132300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 3132400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 3132500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 3132600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 3132700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 3132800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 3132900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 3133000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 3133100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 3133200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 3133300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 3133400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 3133500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.036, + "step": 3133600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 3133700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 3133800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 3133900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 3134000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 3134100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 3134200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 3134300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 3134400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 3134500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 3134600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 3134700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 3134800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 3134900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 3135000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 3135100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 3135200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 3135300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 3135400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 3135500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 3135600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 3135700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 3135800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 3135900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 3136000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 3136100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 3136200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 3136300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 3136400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 3136500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 3136600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 3136700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0296, + "step": 3136800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 3136900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 3137000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 3137100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 3137200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 3137300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 3137400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 3137500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 3137600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 3137700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 3137800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 3137900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0293, + "step": 3138000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 3138100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 3138200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 3138300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 3138400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 3138500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 3138600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 3138700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 3138800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.029, + "step": 3138900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 3139000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 3139100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 3139200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 3139300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 3139400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 3139500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 3139600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 3139700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 3139800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 3139900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0328, + "step": 3140000 + }, + { + "epoch": 0.0, + "eval_loss": 0.028167724609375, + "eval_runtime": 3373.5486, + "eval_samples_per_second": 333.395, + "eval_steps_per_second": 20.837, + "step": 3140000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 3140100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 3140200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 3140300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 3140400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 3140500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0294, + "step": 3140600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 3140700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 3140800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 3140900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 3141000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 3141100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 3141200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 3141300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 3141400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 3141500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 3141600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 3141700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 3141800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 3141900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 3142000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 3142100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 3142200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 3142300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 3142400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 3142500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 3142600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 3142700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0338, + "step": 3142800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0355, + "step": 3142900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 3143000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 3143100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 3143200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 3143300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 3143400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 3143500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 3143600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 3143700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 3143800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0296, + "step": 3143900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 3144000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0296, + "step": 3144100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 3144200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 3144300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 3144400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 3144500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 3144600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 3144700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 3144800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 3144900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 3145000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 3145100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 3145200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 3145300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 3145400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 3145500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0289, + "step": 3145600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 3145700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 3145800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 3145900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 3146000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 3146100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 3146200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 3146300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 3146400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 3146500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 3146600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0296, + "step": 3146700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 3146800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 3146900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 3147000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 3147100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 3147200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 3147300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 3147400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0296, + "step": 3147500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 3147600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 3147700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 3147800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 3147900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 3148000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 3148100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 3148200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 3148300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 3148400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 3148500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 3148600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 3148700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 3148800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 3148900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 3149000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 3149100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 3149200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 3149300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.029, + "step": 3149400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 3149500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 3149600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 3149700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 3149800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 3149900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 3150000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 3150100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 3150200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 3150300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 3150400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 3150500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 3150600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 3150700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0292, + "step": 3150800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 3150900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 3151000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 3151100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 3151200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 3151300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 3151400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0293, + "step": 3151500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 3151600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 3151700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 3151800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 3151900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 3152000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 3152100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0337, + "step": 3152200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 3152300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 3152400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 3152500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0292, + "step": 3152600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 3152700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 3152800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 3152900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0296, + "step": 3153000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 3153100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 3153200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 3153300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 3153400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 3153500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0294, + "step": 3153600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 3153700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 3153800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 3153900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 3154000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 3154100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 3154200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 3154300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 3154400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0294, + "step": 3154500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 3154600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 3154700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 3154800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 3154900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 3155000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0332, + "step": 3155100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0327, + "step": 3155200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 3155300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 3155400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 3155500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 3155600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 3155700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0354, + "step": 3155800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 3155900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 3156000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 3156100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 3156200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 3156300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 3156400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 3156500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 3156600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 3156700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 3156800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 3156900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 3157000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 3157100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 3157200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 3157300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 3157400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 3157500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 3157600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 3157700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 3157800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 3157900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 3158000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 3158100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 3158200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 3158300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 3158400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 3158500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 3158600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 3158700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 3158800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 3158900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 3159000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 3159100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 3159200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 3159300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 3159400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 3159500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 3159600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 3159700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 3159800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 3159900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 3160000 + }, + { + "epoch": 0.0, + "eval_loss": 0.02813720703125, + "eval_runtime": 3409.7011, + "eval_samples_per_second": 329.86, + "eval_steps_per_second": 20.616, + "step": 3160000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 3160100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 3160200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 3160300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 3160400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 3160500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0296, + "step": 3160600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 3160700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 3160800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 3160900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 3161000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 3161100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 3161200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 3161300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 3161400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 3161500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 3161600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 3161700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 3161800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 3161900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 3162000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0329, + "step": 3162100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 3162200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 3162300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0364, + "step": 3162400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 3162500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 3162600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 3162700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 3162800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 3162900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 3163000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 3163100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 3163200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 3163300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 3163400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 3163500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 3163600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 3163700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 3163800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 3163900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 3164000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 3164100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 3164200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0296, + "step": 3164300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 3164400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 3164500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 3164600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 3164700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 3164800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 3164900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 3165000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 3165100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 3165200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 3165300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 3165400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 3165500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0319, + "step": 3165600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 3165700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 3165800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 3165900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 3166000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 3166100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 3166200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0285, + "step": 3166300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 3166400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 3166500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 3166600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0305, + "step": 3166700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 3166800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 3166900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 3167000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 3167100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 3167200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 3167300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 3167400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 3167500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 3167600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0326, + "step": 3167700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 3167800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 3167900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 3168000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0333, + "step": 3168100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 3168200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 3168300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 3168400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 3168500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 3168600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 3168700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 3168800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0368, + "step": 3168900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 3169000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 3169100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 3169200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 3169300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 3169400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 3169500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 3169600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 3169700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0294, + "step": 3169800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 3169900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0325, + "step": 3170000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 3170100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 3170200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 3170300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0289, + "step": 3170400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 3170500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 3170600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 3170700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0293, + "step": 3170800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 3170900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 3171000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 3171100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 3171200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 3171300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0324, + "step": 3171400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 3171500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 3171600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 3171700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 3171800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 3171900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 3172000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0292, + "step": 3172100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 3172200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0294, + "step": 3172300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0346, + "step": 3172400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 3172500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 3172600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 3172700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0339, + "step": 3172800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 3172900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0291, + "step": 3173000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 3173100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0304, + "step": 3173200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 3173300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 3173400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 3173500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 3173600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 3173700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 3173800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 3173900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 3174000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 3174100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 3174200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 3174300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 3174400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0323, + "step": 3174500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 3174600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 3174700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 3174800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0322, + "step": 3174900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 3175000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 3175100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 3175200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.031, + "step": 3175300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 3175400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 3175500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0331, + "step": 3175600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0321, + "step": 3175700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0336, + "step": 3175800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 3175900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 3176000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 3176100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 3176200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 3176300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0334, + "step": 3176400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 3176500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0301, + "step": 3176600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0308, + "step": 3176700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 3176800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0294, + "step": 3176900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0303, + "step": 3177000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0316, + "step": 3177100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0318, + "step": 3177200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 3177300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0298, + "step": 3177400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 3177500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0302, + "step": 3177600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 3177700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0296, + "step": 3177800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0294, + "step": 3177900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 3178000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 3178100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 3178200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0299, + "step": 3178300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0315, + "step": 3178400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 3178500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 3178600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0313, + "step": 3178700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0309, + "step": 3178800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.033, + "step": 3178900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.032, + "step": 3179000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0311, + "step": 3179100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0317, + "step": 3179200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0297, + "step": 3179300 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0314, + "step": 3179400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0295, + "step": 3179500 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0307, + "step": 3179600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0306, + "step": 3179700 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 3179800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.0312, + "step": 3179900 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001, + "loss": 0.03, + "step": 3180000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0281524658203125, + "eval_runtime": 3361.7471, + "eval_samples_per_second": 334.565, + "eval_steps_per_second": 20.911, + "step": 3180000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0303, + "step": 3180100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0295, + "step": 3180200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.031, + "step": 3180300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.029, + "step": 3180400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0295, + "step": 3180500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0302, + "step": 3180600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0304, + "step": 3180700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0287, + "step": 3180800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0299, + "step": 3180900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0303, + "step": 3181000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0292, + "step": 3181100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0285, + "step": 3181200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.03, + "step": 3181300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0287, + "step": 3181400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.029, + "step": 3181500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0286, + "step": 3181600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.032, + "step": 3181700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0287, + "step": 3181800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0296, + "step": 3181900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0277, + "step": 3182000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0287, + "step": 3182100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0297, + "step": 3182200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0285, + "step": 3182300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0287, + "step": 3182400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0291, + "step": 3182500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0282, + "step": 3182600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0285, + "step": 3182700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0278, + "step": 3182800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0294, + "step": 3182900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0293, + "step": 3183000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0285, + "step": 3183100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0285, + "step": 3183200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0293, + "step": 3183300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0283, + "step": 3183400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0286, + "step": 3183500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0299, + "step": 3183600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0285, + "step": 3183700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0281, + "step": 3183800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0285, + "step": 3183900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0284, + "step": 3184000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0282, + "step": 3184100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0286, + "step": 3184200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0276, + "step": 3184300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0283, + "step": 3184400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0289, + "step": 3184500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.029, + "step": 3184600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.028, + "step": 3184700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0271, + "step": 3184800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0285, + "step": 3184900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0281, + "step": 3185000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0283, + "step": 3185100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0282, + "step": 3185200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0294, + "step": 3185300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.028, + "step": 3185400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0282, + "step": 3185500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0278, + "step": 3185600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0271, + "step": 3185700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3185800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0289, + "step": 3185900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0283, + "step": 3186000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0293, + "step": 3186100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.027, + "step": 3186200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0279, + "step": 3186300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0283, + "step": 3186400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0284, + "step": 3186500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0285, + "step": 3186600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0275, + "step": 3186700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0272, + "step": 3186800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0271, + "step": 3186900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.029, + "step": 3187000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0278, + "step": 3187100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0277, + "step": 3187200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0278, + "step": 3187300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0287, + "step": 3187400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0276, + "step": 3187500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0294, + "step": 3187600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.027, + "step": 3187700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0275, + "step": 3187800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.028, + "step": 3187900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0278, + "step": 3188000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0271, + "step": 3188100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0275, + "step": 3188200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0283, + "step": 3188300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0278, + "step": 3188400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0282, + "step": 3188500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0284, + "step": 3188600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0283, + "step": 3188700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0284, + "step": 3188800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0272, + "step": 3188900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.027, + "step": 3189000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0276, + "step": 3189100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0268, + "step": 3189200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0285, + "step": 3189300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0279, + "step": 3189400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0279, + "step": 3189500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3189600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0272, + "step": 3189700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0271, + "step": 3189800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.028, + "step": 3189900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0283, + "step": 3190000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0305, + "step": 3190100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0277, + "step": 3190200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0276, + "step": 3190300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.028, + "step": 3190400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0288, + "step": 3190500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0279, + "step": 3190600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0268, + "step": 3190700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.029, + "step": 3190800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0278, + "step": 3190900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0287, + "step": 3191000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0275, + "step": 3191100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0284, + "step": 3191200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0271, + "step": 3191300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0272, + "step": 3191400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0269, + "step": 3191500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.028, + "step": 3191600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0293, + "step": 3191700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0276, + "step": 3191800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3191900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0276, + "step": 3192000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.027, + "step": 3192100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0273, + "step": 3192200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0275, + "step": 3192300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0279, + "step": 3192400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0282, + "step": 3192500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0268, + "step": 3192600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0283, + "step": 3192700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3192800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3192900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3193000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.028, + "step": 3193100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0279, + "step": 3193200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0268, + "step": 3193300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3193400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.028, + "step": 3193500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.027, + "step": 3193600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0291, + "step": 3193700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0278, + "step": 3193800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0277, + "step": 3193900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0282, + "step": 3194000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0272, + "step": 3194100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0284, + "step": 3194200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0281, + "step": 3194300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0275, + "step": 3194400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0281, + "step": 3194500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0289, + "step": 3194600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0282, + "step": 3194700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0276, + "step": 3194800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.027, + "step": 3194900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0275, + "step": 3195000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3195100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0274, + "step": 3195200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3195300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0274, + "step": 3195400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0278, + "step": 3195500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0272, + "step": 3195600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0269, + "step": 3195700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.029, + "step": 3195800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3195900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0271, + "step": 3196000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0281, + "step": 3196100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3196200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.027, + "step": 3196300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.027, + "step": 3196400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.028, + "step": 3196500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0284, + "step": 3196600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0277, + "step": 3196700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0279, + "step": 3196800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0282, + "step": 3196900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3197000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0271, + "step": 3197100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0278, + "step": 3197200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0272, + "step": 3197300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0282, + "step": 3197400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0269, + "step": 3197500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0271, + "step": 3197600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0279, + "step": 3197700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0272, + "step": 3197800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0276, + "step": 3197900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0271, + "step": 3198000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0274, + "step": 3198100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3198200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3198300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3198400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3198500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0269, + "step": 3198600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0277, + "step": 3198700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0283, + "step": 3198800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0274, + "step": 3198900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0278, + "step": 3199000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.028, + "step": 3199100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0278, + "step": 3199200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0291, + "step": 3199300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3199400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0272, + "step": 3199500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0295, + "step": 3199600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0291, + "step": 3199700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0274, + "step": 3199800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0285, + "step": 3199900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0281, + "step": 3200000 + }, + { + "epoch": 0.0, + "eval_loss": 0.02618408203125, + "eval_runtime": 3400.2762, + "eval_samples_per_second": 330.774, + "eval_steps_per_second": 20.674, + "step": 3200000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0283, + "step": 3200100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0277, + "step": 3200200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0275, + "step": 3200300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.027, + "step": 3200400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0273, + "step": 3200500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3200600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0275, + "step": 3200700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0275, + "step": 3200800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0276, + "step": 3200900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0276, + "step": 3201000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3201100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.028, + "step": 3201200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0283, + "step": 3201300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3201400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3201500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3201600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0272, + "step": 3201700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3201800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3201900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0274, + "step": 3202000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3202100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0272, + "step": 3202200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.027, + "step": 3202300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0268, + "step": 3202400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3202500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0275, + "step": 3202600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0269, + "step": 3202700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0282, + "step": 3202800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0271, + "step": 3202900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3203000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0278, + "step": 3203100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0284, + "step": 3203200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0273, + "step": 3203300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0273, + "step": 3203400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3203500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.027, + "step": 3203600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3203700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0285, + "step": 3203800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0277, + "step": 3203900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0273, + "step": 3204000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.027, + "step": 3204100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3204200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0269, + "step": 3204300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0274, + "step": 3204400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.027, + "step": 3204500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0285, + "step": 3204600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0274, + "step": 3204700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3204800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0279, + "step": 3204900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.027, + "step": 3205000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3205100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0275, + "step": 3205200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0272, + "step": 3205300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3205400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0282, + "step": 3205500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0271, + "step": 3205600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3205700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0272, + "step": 3205800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0276, + "step": 3205900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0273, + "step": 3206000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0277, + "step": 3206100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3206200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0286, + "step": 3206300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0278, + "step": 3206400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0274, + "step": 3206500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0268, + "step": 3206600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0272, + "step": 3206700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0275, + "step": 3206800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0275, + "step": 3206900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.028, + "step": 3207000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0274, + "step": 3207100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.028, + "step": 3207200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0276, + "step": 3207300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0285, + "step": 3207400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0272, + "step": 3207500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0275, + "step": 3207600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3207700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0273, + "step": 3207800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0282, + "step": 3207900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0278, + "step": 3208000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.028, + "step": 3208100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0272, + "step": 3208200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0272, + "step": 3208300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0278, + "step": 3208400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0275, + "step": 3208500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3208600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0285, + "step": 3208700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0269, + "step": 3208800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3208900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3209000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0276, + "step": 3209100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0272, + "step": 3209200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3209300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0269, + "step": 3209400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3209500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0274, + "step": 3209600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0271, + "step": 3209700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0275, + "step": 3209800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3209900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3210000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0271, + "step": 3210100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0276, + "step": 3210200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0276, + "step": 3210300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3210400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3210500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0272, + "step": 3210600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.027, + "step": 3210700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3210800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0269, + "step": 3210900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0273, + "step": 3211000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.027, + "step": 3211100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0277, + "step": 3211200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3211300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0271, + "step": 3211400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.027, + "step": 3211500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3211600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0274, + "step": 3211700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0274, + "step": 3211800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3211900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0272, + "step": 3212000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0277, + "step": 3212100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3212200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.027, + "step": 3212300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3212400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0273, + "step": 3212500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3212600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0271, + "step": 3212700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3212800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3212900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3213000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0276, + "step": 3213100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0274, + "step": 3213200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3213300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0272, + "step": 3213400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.027, + "step": 3213500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3213600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.027, + "step": 3213700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.027, + "step": 3213800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3213900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3214000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3214100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0273, + "step": 3214200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.028, + "step": 3214300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0274, + "step": 3214400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3214500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0269, + "step": 3214600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0274, + "step": 3214700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0271, + "step": 3214800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3214900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0281, + "step": 3215000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3215100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3215200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0272, + "step": 3215300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3215400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0269, + "step": 3215500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3215600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0274, + "step": 3215700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3215800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0278, + "step": 3215900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0271, + "step": 3216000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3216100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0268, + "step": 3216200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0271, + "step": 3216300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3216400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0271, + "step": 3216500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0273, + "step": 3216600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0272, + "step": 3216700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3216800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.027, + "step": 3216900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0268, + "step": 3217000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3217100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.027, + "step": 3217200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3217300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0268, + "step": 3217400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.027, + "step": 3217500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3217600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0275, + "step": 3217700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0269, + "step": 3217800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0272, + "step": 3217900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0272, + "step": 3218000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3218100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.027, + "step": 3218200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0268, + "step": 3218300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.027, + "step": 3218400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.027, + "step": 3218500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0268, + "step": 3218600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3218700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0272, + "step": 3218800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0274, + "step": 3218900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3219000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3219100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3219200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0281, + "step": 3219300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0275, + "step": 3219400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3219500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.027, + "step": 3219600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0273, + "step": 3219700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3219800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0277, + "step": 3219900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3220000 + }, + { + "epoch": 0.0, + "eval_loss": 0.02593994140625, + "eval_runtime": 3278.6598, + "eval_samples_per_second": 343.044, + "eval_steps_per_second": 21.44, + "step": 3220000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3220100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3220200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0283, + "step": 3220300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3220400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0275, + "step": 3220500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3220600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3220700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3220800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0273, + "step": 3220900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3221000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3221100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3221200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3221300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0269, + "step": 3221400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.027, + "step": 3221500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3221600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3221700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0273, + "step": 3221800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3221900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0269, + "step": 3222000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3222100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0275, + "step": 3222200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3222300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.027, + "step": 3222400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3222500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0272, + "step": 3222600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3222700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3222800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3222900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.027, + "step": 3223000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0276, + "step": 3223100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3223200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3223300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.027, + "step": 3223400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0279, + "step": 3223500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.027, + "step": 3223600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0269, + "step": 3223700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3223800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0279, + "step": 3223900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3224000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3224100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3224200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0272, + "step": 3224300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3224400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0271, + "step": 3224500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3224600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3224700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0271, + "step": 3224800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0282, + "step": 3224900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3225000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3225100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0271, + "step": 3225200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3225300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3225400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3225500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0268, + "step": 3225600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3225700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0275, + "step": 3225800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0272, + "step": 3225900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0277, + "step": 3226000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0273, + "step": 3226100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0268, + "step": 3226200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3226300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3226400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3226500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3226600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0272, + "step": 3226700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3226800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3226900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0269, + "step": 3227000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3227100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0276, + "step": 3227200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3227300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0273, + "step": 3227400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3227500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3227600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0272, + "step": 3227700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0278, + "step": 3227800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.027, + "step": 3227900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0287, + "step": 3228000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0271, + "step": 3228100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0274, + "step": 3228200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0275, + "step": 3228300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3228400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3228500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0277, + "step": 3228600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3228700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3228800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3228900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3229000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3229100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3229200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3229300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0271, + "step": 3229400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3229500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3229600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0273, + "step": 3229700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3229800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3229900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0269, + "step": 3230000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0275, + "step": 3230100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0277, + "step": 3230200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0274, + "step": 3230300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3230400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0274, + "step": 3230500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3230600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0269, + "step": 3230700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3230800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0275, + "step": 3230900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0268, + "step": 3231000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0271, + "step": 3231100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3231200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0269, + "step": 3231300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3231400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0274, + "step": 3231500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0269, + "step": 3231600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0283, + "step": 3231700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0279, + "step": 3231800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3231900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3232000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0274, + "step": 3232100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0278, + "step": 3232200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0273, + "step": 3232300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3232400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3232500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.027, + "step": 3232600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.027, + "step": 3232700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0281, + "step": 3232800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3232900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3233000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3233100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3233200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3233300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3233400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3233500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3233600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0269, + "step": 3233700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3233800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0271, + "step": 3233900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0274, + "step": 3234000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3234100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3234200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0268, + "step": 3234300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3234400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3234500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3234600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0286, + "step": 3234700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3234800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3234900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3235000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0275, + "step": 3235100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3235200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3235300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3235400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0276, + "step": 3235500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3235600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3235700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3235800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3235900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3236000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3236100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3236200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3236300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3236400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3236500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0271, + "step": 3236600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0284, + "step": 3236700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3236800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0273, + "step": 3236900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3237000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3237100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0272, + "step": 3237200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0285, + "step": 3237300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3237400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3237500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3237600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0285, + "step": 3237700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3237800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0269, + "step": 3237900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3238000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0272, + "step": 3238100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3238200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3238300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3238400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3238500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3238600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0273, + "step": 3238700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3238800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3238900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3239000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3239100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3239200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0269, + "step": 3239300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3239400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0268, + "step": 3239500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0275, + "step": 3239600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0282, + "step": 3239700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3239800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3239900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3240000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0257415771484375, + "eval_runtime": 3195.2132, + "eval_samples_per_second": 352.002, + "eval_steps_per_second": 22.0, + "step": 3240000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0271, + "step": 3240100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0269, + "step": 3240200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3240300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3240400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3240500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3240600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3240700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0271, + "step": 3240800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3240900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0269, + "step": 3241000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0268, + "step": 3241100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3241200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0271, + "step": 3241300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3241400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3241500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3241600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3241700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0276, + "step": 3241800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3241900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3242000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3242100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3242200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3242300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3242400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3242500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3242600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3242700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3242800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3242900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0269, + "step": 3243000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3243100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3243200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3243300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3243400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3243500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0272, + "step": 3243600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3243700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3243800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0268, + "step": 3243900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3244000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3244100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3244200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3244300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3244400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0276, + "step": 3244500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3244600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0276, + "step": 3244700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0275, + "step": 3244800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3244900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3245000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3245100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.027, + "step": 3245200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0277, + "step": 3245300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3245400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3245500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3245600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3245700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3245800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3245900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3246000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3246100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3246200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3246300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3246400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0274, + "step": 3246500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3246600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3246700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0268, + "step": 3246800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0271, + "step": 3246900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3247000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3247100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3247200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3247300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0269, + "step": 3247400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3247500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0268, + "step": 3247600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0268, + "step": 3247700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0274, + "step": 3247800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.027, + "step": 3247900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0269, + "step": 3248000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3248100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0273, + "step": 3248200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3248300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3248400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3248500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3248600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3248700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3248800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0277, + "step": 3248900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3249000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3249100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3249200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3249300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3249400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3249500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0273, + "step": 3249600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3249700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3249800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3249900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3250000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3250100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3250200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3250300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3250400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0268, + "step": 3250500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3250600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3250700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3250800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3250900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3251000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3251100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0268, + "step": 3251200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3251300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3251400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0279, + "step": 3251500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0272, + "step": 3251600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3251700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3251800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3251900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3252000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3252100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3252200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3252300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3252400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0268, + "step": 3252500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3252600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3252700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0246, + "step": 3252800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3252900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0272, + "step": 3253000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0271, + "step": 3253100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3253200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3253300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3253400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3253500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0271, + "step": 3253600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.027, + "step": 3253700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3253800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3253900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3254000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0271, + "step": 3254100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3254200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3254300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3254400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3254500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3254600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3254700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3254800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0268, + "step": 3254900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0275, + "step": 3255000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0271, + "step": 3255100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3255200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0268, + "step": 3255300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0268, + "step": 3255400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3255500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0274, + "step": 3255600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3255700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0269, + "step": 3255800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3255900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3256000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3256100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0271, + "step": 3256200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3256300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3256400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3256500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0276, + "step": 3256600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3256700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3256800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3256900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3257000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3257100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3257200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0268, + "step": 3257300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0269, + "step": 3257400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3257500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3257600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0276, + "step": 3257700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3257800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3257900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0269, + "step": 3258000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3258100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3258200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3258300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3258400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3258500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0283, + "step": 3258600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0272, + "step": 3258700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3258800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3258900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3259000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3259100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3259200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3259300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3259400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3259500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3259600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3259700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3259800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3259900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0268, + "step": 3260000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0256500244140625, + "eval_runtime": 3120.5744, + "eval_samples_per_second": 360.422, + "eval_steps_per_second": 22.527, + "step": 3260000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3260100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0273, + "step": 3260200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3260300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3260400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0269, + "step": 3260500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3260600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3260700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3260800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0269, + "step": 3260900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3261000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3261100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3261200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3261300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3261400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3261500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3261600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3261700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.027, + "step": 3261800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3261900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3262000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3262100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3262200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3262300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0268, + "step": 3262400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3262500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3262600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3262700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0244, + "step": 3262800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3262900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3263000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3263100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0268, + "step": 3263200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0274, + "step": 3263300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3263400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0279, + "step": 3263500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.027, + "step": 3263600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3263700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0268, + "step": 3263800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3263900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0268, + "step": 3264000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3264100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3264200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3264300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3264400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3264500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0271, + "step": 3264600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3264700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0271, + "step": 3264800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0269, + "step": 3264900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3265000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0269, + "step": 3265100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3265200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0271, + "step": 3265300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3265400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3265500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0277, + "step": 3265600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0272, + "step": 3265700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3265800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3265900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0271, + "step": 3266000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3266100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0273, + "step": 3266200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3266300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3266400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0268, + "step": 3266500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0269, + "step": 3266600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3266700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3266800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0269, + "step": 3266900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3267000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3267100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3267200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0271, + "step": 3267300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3267400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3267500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.027, + "step": 3267600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0274, + "step": 3267700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3267800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0268, + "step": 3267900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3268000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3268100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3268200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3268300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3268400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0274, + "step": 3268500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3268600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3268700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3268800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0272, + "step": 3268900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3269000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3269100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3269200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3269300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3269400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3269500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3269600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0283, + "step": 3269700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0275, + "step": 3269800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3269900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3270000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3270100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3270200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3270300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3270400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3270500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3270600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3270700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3270800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3270900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3271000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3271100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3271200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3271300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3271400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3271500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3271600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3271700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3271800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0272, + "step": 3271900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3272000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3272100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3272200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3272300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3272400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0277, + "step": 3272500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0247, + "step": 3272600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3272700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3272800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3272900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0268, + "step": 3273000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3273100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3273200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0275, + "step": 3273300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3273400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3273500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3273600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3273700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3273800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0269, + "step": 3273900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0269, + "step": 3274000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3274100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0273, + "step": 3274200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3274300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3274400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3274500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3274600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3274700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3274800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3274900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3275000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3275100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3275200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3275300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0274, + "step": 3275400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3275500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3275600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3275700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3275800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3275900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3276000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3276100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3276200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0277, + "step": 3276300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3276400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3276500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3276600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3276700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3276800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3276900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0279, + "step": 3277000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3277100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3277200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3277300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3277400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3277500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3277600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3277700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3277800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3277900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0272, + "step": 3278000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3278100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0247, + "step": 3278200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3278300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3278400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3278500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3278600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3278700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3278800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3278900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3279000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3279100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3279200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0268, + "step": 3279300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3279400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3279500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3279600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3279700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3279800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3279900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3280000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0255584716796875, + "eval_runtime": 3272.9094, + "eval_samples_per_second": 343.646, + "eval_steps_per_second": 21.478, + "step": 3280000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3280100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0275, + "step": 3280200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3280300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3280400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3280500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0246, + "step": 3280600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0274, + "step": 3280700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3280800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3280900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3281000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3281100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3281200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3281300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0272, + "step": 3281400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0271, + "step": 3281500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3281600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3281700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3281800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3281900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3282000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3282100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3282200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3282300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3282400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0272, + "step": 3282500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3282600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3282700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3282800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0283, + "step": 3282900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3283000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3283100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3283200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0269, + "step": 3283300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3283400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0272, + "step": 3283500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3283600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3283700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3283800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3283900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0269, + "step": 3284000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3284100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0269, + "step": 3284200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3284300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3284400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3284500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3284600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3284700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3284800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3284900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3285000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3285100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3285200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3285300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3285400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3285500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3285600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3285700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3285800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3285900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3286000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3286100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3286200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3286300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3286400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3286500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0269, + "step": 3286600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0268, + "step": 3286700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3286800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3286900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3287000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3287100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3287200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3287300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3287400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3287500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3287600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3287700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3287800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3287900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3288000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3288100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0269, + "step": 3288200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3288300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0268, + "step": 3288400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3288500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3288600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3288700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3288800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0273, + "step": 3288900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3289000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3289100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0273, + "step": 3289200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3289300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3289400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3289500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3289600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0271, + "step": 3289700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0268, + "step": 3289800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3289900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3290000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3290100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3290200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3290300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3290400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3290500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3290600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3290700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0279, + "step": 3290800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3290900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0271, + "step": 3291000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3291100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0268, + "step": 3291200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3291300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3291400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3291500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0285, + "step": 3291600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3291700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3291800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3291900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3292000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3292100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3292200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3292300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3292400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3292500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0273, + "step": 3292600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3292700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3292800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3292900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3293000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3293100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0278, + "step": 3293200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3293300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0275, + "step": 3293400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.027, + "step": 3293500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3293600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3293700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3293800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3293900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3294000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3294100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3294200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0275, + "step": 3294300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3294400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3294500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3294600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3294700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3294800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3294900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0272, + "step": 3295000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3295100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3295200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3295300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3295400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3295500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3295600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3295700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3295800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3295900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3296000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3296100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3296200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3296300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3296400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0268, + "step": 3296500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0268, + "step": 3296600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3296700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3296800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3296900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0274, + "step": 3297000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3297100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3297200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3297300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3297400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3297500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3297600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3297700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3297800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0273, + "step": 3297900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0278, + "step": 3298000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3298100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3298200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0271, + "step": 3298300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3298400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3298500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3298600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3298700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3298800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3298900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3299000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3299100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3299200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3299300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3299400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3299500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3299600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0272, + "step": 3299700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3299800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3299900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3300000 + }, + { + "epoch": 0.0, + "eval_loss": 0.025482177734375, + "eval_runtime": 3277.7006, + "eval_samples_per_second": 343.144, + "eval_steps_per_second": 21.447, + "step": 3300000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0268, + "step": 3300100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3300200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3300300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3300400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3300500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3300600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3300700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3300800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3300900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3301000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3301100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3301200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3301300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3301400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3301500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3301600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0273, + "step": 3301700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3301800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3301900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3302000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0269, + "step": 3302100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3302200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3302300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3302400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3302500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0269, + "step": 3302600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3302700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3302800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3302900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0279, + "step": 3303000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3303100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3303200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0268, + "step": 3303300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3303400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0246, + "step": 3303500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3303600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3303700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3303800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3303900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3304000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3304100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3304200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3304300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3304400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3304500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0269, + "step": 3304600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3304700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3304800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3304900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3305000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3305100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3305200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3305300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3305400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3305500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3305600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0282, + "step": 3305700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3305800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3305900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0268, + "step": 3306000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3306100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3306200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3306300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3306400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0269, + "step": 3306500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.027, + "step": 3306600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3306700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3306800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0273, + "step": 3306900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3307000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0268, + "step": 3307100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3307200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0246, + "step": 3307300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3307400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3307500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3307600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3307700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3307800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3307900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3308000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0272, + "step": 3308100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3308200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3308300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3308400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3308500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3308600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3308700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3308800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3308900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0275, + "step": 3309000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3309100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3309200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3309300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0269, + "step": 3309400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0269, + "step": 3309500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3309600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3309700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3309800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3309900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3310000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3310100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3310200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.027, + "step": 3310300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3310400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3310500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3310600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3310700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3310800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3310900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3311000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0268, + "step": 3311100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3311200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0268, + "step": 3311300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3311400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3311500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3311600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3311700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3311800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3311900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3312000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3312100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3312200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3312300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0273, + "step": 3312400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0268, + "step": 3312500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3312600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3312700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3312800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3312900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3313000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3313100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3313200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3313300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0269, + "step": 3313400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3313500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3313600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3313700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3313800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3313900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3314000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0272, + "step": 3314100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3314200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0269, + "step": 3314300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0269, + "step": 3314400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3314500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3314600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3314700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3314800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0272, + "step": 3314900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3315000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3315100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0274, + "step": 3315200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3315300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0268, + "step": 3315400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3315500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3315600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3315700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3315800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3315900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.027, + "step": 3316000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3316100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3316200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3316300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3316400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3316500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3316600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3316700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3316800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3316900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3317000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3317100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0246, + "step": 3317200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3317300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.027, + "step": 3317400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3317500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3317600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3317700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3317800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3317900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3318000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0272, + "step": 3318100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3318200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0279, + "step": 3318300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3318400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3318500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3318600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3318700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3318800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3318900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3319000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3319100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3319200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0274, + "step": 3319300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3319400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3319500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3319600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3319700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3319800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0287, + "step": 3319900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3320000 + }, + { + "epoch": 0.0, + "eval_loss": 0.025390625, + "eval_runtime": 3295.1049, + "eval_samples_per_second": 341.331, + "eval_steps_per_second": 21.333, + "step": 3320000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0274, + "step": 3320100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3320200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3320300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3320400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3320500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3320600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3320700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3320800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3320900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3321000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3321100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0272, + "step": 3321200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3321300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3321400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3321500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3321600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3321700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3321800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3321900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3322000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3322100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0277, + "step": 3322200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0272, + "step": 3322300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3322400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0271, + "step": 3322500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3322600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3322700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3322800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3322900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3323000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3323100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3323200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3323300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3323400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3323500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0285, + "step": 3323600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3323700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3323800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3323900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3324000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3324100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0271, + "step": 3324200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3324300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0271, + "step": 3324400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3324500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3324600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3324700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3324800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3324900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3325000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0281, + "step": 3325100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3325200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3325300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3325400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0273, + "step": 3325500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3325600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3325700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3325800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3325900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3326000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3326100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3326200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3326300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0274, + "step": 3326400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3326500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3326600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3326700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3326800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3326900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3327000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3327100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0277, + "step": 3327200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3327300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0273, + "step": 3327400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3327500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3327600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3327700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3327800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3327900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3328000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3328100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3328200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3328300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3328400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3328500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3328600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3328700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3328800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0268, + "step": 3328900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0243, + "step": 3329000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3329100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3329200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0272, + "step": 3329300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3329400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0268, + "step": 3329500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3329600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3329700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3329800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3329900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3330000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3330100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3330200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0271, + "step": 3330300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3330400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3330500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0268, + "step": 3330600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3330700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3330800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0274, + "step": 3330900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.027, + "step": 3331000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3331100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0273, + "step": 3331200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.027, + "step": 3331300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3331400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3331500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3331600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3331700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3331800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3331900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3332000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3332100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3332200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3332300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0268, + "step": 3332400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3332500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3332600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3332700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3332800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3332900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3333000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3333100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3333200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3333300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3333400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3333500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0277, + "step": 3333600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0273, + "step": 3333700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0268, + "step": 3333800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3333900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3334000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3334100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3334200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3334300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3334400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3334500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3334600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3334700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3334800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3334900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0273, + "step": 3335000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3335100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3335200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3335300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3335400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3335500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3335600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3335700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3335800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3335900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.027, + "step": 3336000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3336100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3336200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3336300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3336400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3336500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3336600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3336700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3336800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3336900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3337000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3337100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3337200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3337300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.027, + "step": 3337400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3337500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3337600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3337700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3337800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3337900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3338000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3338100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3338200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3338300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3338400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3338500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0268, + "step": 3338600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3338700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3338800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3338900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3339000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3339100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3339200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3339300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3339400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3339500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3339600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3339700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3339800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3339900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0272, + "step": 3340000 + }, + { + "epoch": 0.0, + "eval_loss": 0.02532958984375, + "eval_runtime": 3251.3678, + "eval_samples_per_second": 345.923, + "eval_steps_per_second": 21.62, + "step": 3340000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3340100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3340200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3340300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3340400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.027, + "step": 3340500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3340600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3340700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3340800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3340900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3341000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3341100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3341200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3341300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3341400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3341500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3341600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3341700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3341800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3341900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3342000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3342100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0268, + "step": 3342200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3342300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3342400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3342500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3342600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0269, + "step": 3342700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3342800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3342900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0268, + "step": 3343000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3343100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3343200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3343300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3343400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3343500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3343600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3343700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3343800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3343900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3344000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3344100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3344200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3344300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3344400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3344500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0269, + "step": 3344600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.027, + "step": 3344700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3344800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3344900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3345000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3345100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3345200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3345300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0271, + "step": 3345400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3345500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3345600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3345700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3345800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3345900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3346000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3346100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3346200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3346300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3346400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3346500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3346600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3346700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3346800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3346900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0277, + "step": 3347000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3347100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3347200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3347300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3347400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3347500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3347600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3347700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3347800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3347900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3348000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3348100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3348200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3348300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0268, + "step": 3348400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3348500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3348600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3348700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3348800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3348900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3349000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0268, + "step": 3349100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3349200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3349300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3349400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3349500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3349600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3349700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3349800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0268, + "step": 3349900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3350000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3350100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0276, + "step": 3350200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3350300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3350400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0268, + "step": 3350500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0247, + "step": 3350600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3350700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3350800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3350900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3351000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3351100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3351200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3351300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3351400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3351500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3351600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3351700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3351800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3351900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3352000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3352100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3352200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3352300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3352400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3352500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3352600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.027, + "step": 3352700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3352800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3352900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3353000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3353100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3353200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3353300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3353400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3353500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3353600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3353700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0269, + "step": 3353800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3353900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3354000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3354100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3354200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3354300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3354400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3354500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3354600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3354700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3354800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3354900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3355000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3355100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3355200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3355300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3355400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3355500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3355600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3355700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3355800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3355900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3356000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3356100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3356200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3356300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3356400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3356500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0269, + "step": 3356600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0269, + "step": 3356700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3356800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3356900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3357000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3357100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3357200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3357300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3357400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3357500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0273, + "step": 3357600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0271, + "step": 3357700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3357800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3357900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3358000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0245, + "step": 3358100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3358200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3358300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3358400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3358500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3358600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3358700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3358800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3358900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3359000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0268, + "step": 3359100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0268, + "step": 3359200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3359300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0246, + "step": 3359400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3359500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3359600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3359700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3359800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3359900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3360000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0253448486328125, + "eval_runtime": 3387.197, + "eval_samples_per_second": 332.051, + "eval_steps_per_second": 20.753, + "step": 3360000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3360100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3360200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3360300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0269, + "step": 3360400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3360500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3360600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3360700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0245, + "step": 3360800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3360900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3361000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3361100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3361200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3361300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3361400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3361500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3361600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3361700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0275, + "step": 3361800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3361900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3362000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3362100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3362200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3362300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3362400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0275, + "step": 3362500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3362600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3362700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3362800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3362900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.027, + "step": 3363000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3363100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3363200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3363300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3363400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3363500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3363600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3363700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3363800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3363900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3364000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3364100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3364200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3364300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3364400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0272, + "step": 3364500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3364600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.027, + "step": 3364700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0273, + "step": 3364800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3364900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3365000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3365100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0275, + "step": 3365200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3365300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3365400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3365500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0275, + "step": 3365600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3365700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3365800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3365900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3366000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3366100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3366200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3366300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0268, + "step": 3366400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3366500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3366600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3366700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3366800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0272, + "step": 3366900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3367000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0272, + "step": 3367100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0274, + "step": 3367200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3367300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0271, + "step": 3367400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3367500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3367600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0247, + "step": 3367700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0281, + "step": 3367800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3367900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3368000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3368100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3368200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3368300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3368400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3368500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3368600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3368700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3368800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3368900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0245, + "step": 3369000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3369100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3369200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3369300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3369400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3369500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3369600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3369700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3369800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3369900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3370000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3370100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3370200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3370300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3370400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3370500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3370600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3370700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0271, + "step": 3370800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3370900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3371000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3371100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0271, + "step": 3371200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3371300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3371400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3371500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3371600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3371700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3371800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3371900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3372000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3372100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3372200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3372300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3372400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3372500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3372600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3372700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3372800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3372900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0272, + "step": 3373000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3373100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3373200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3373300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3373400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3373500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3373600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3373700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3373800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3373900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3374000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3374100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3374200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3374300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3374400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3374500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0271, + "step": 3374600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3374700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3374800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0271, + "step": 3374900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0281, + "step": 3375000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3375100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3375200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3375300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3375400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3375500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0273, + "step": 3375600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3375700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3375800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3375900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3376000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3376100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3376200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0268, + "step": 3376300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3376400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3376500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3376600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3376700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3376800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3376900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3377000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3377100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3377200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3377300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0268, + "step": 3377400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3377500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3377600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3377700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3377800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3377900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3378000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3378100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3378200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3378300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0245, + "step": 3378400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3378500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3378600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3378700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3378800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3378900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3379000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3379100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3379200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3379300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3379400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3379500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3379600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3379700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3379800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3379900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3380000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0252685546875, + "eval_runtime": 3642.4195, + "eval_samples_per_second": 308.785, + "eval_steps_per_second": 19.299, + "step": 3380000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3380100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3380200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3380300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3380400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3380500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3380600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3380700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3380800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3380900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3381000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3381100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3381200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3381300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3381400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3381500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3381600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3381700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3381800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3381900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3382000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3382100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0269, + "step": 3382200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3382300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3382400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3382500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3382600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3382700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3382800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3382900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3383000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3383100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3383200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3383300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3383400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3383500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3383600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3383700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3383800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3383900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3384000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3384100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3384200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3384300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3384400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3384500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3384600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3384700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0269, + "step": 3384800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3384900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3385000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3385100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3385200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3385300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3385400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3385500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3385600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3385700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3385800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3385900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3386000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3386100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0278, + "step": 3386200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3386300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3386400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3386500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3386600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3386700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3386800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3386900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3387000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3387100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3387200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3387300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3387400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3387500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3387600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3387700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3387800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3387900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3388000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3388100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3388200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0269, + "step": 3388300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3388400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3388500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3388600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3388700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3388800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0276, + "step": 3388900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3389000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3389100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0246, + "step": 3389200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3389300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3389400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3389500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3389600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3389700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0274, + "step": 3389800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3389900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3390000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3390100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3390200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3390300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3390400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0276, + "step": 3390500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3390600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3390700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3390800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3390900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3391000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0247, + "step": 3391100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3391200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3391300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3391400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3391500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0268, + "step": 3391600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3391700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3391800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0269, + "step": 3391900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3392000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3392100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3392200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3392300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3392400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3392500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3392600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3392700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3392800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3392900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3393000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3393100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3393200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3393300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3393400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.027, + "step": 3393500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3393600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3393700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3393800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3393900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3394000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3394100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3394200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0245, + "step": 3394300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.027, + "step": 3394400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3394500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3394600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0268, + "step": 3394700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3394800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3394900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3395000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3395100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3395200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3395300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3395400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3395500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3395600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3395700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0247, + "step": 3395800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0271, + "step": 3395900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3396000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3396100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.027, + "step": 3396200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3396300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3396400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3396500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3396600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3396700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3396800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3396900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3397000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0268, + "step": 3397100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3397200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0276, + "step": 3397300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3397400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3397500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3397600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3397700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3397800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3397900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3398000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3398100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3398200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3398300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3398400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3398500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3398600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3398700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3398800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3398900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0271, + "step": 3399000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3399100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3399200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0275, + "step": 3399300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3399400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3399500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3399600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3399700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3399800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3399900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3400000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0252838134765625, + "eval_runtime": 3529.8352, + "eval_samples_per_second": 318.633, + "eval_steps_per_second": 19.915, + "step": 3400000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3400100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3400200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3400300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3400400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3400500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3400600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3400700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3400800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0272, + "step": 3400900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3401000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3401100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3401200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3401300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3401400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3401500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3401600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.027, + "step": 3401700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3401800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3401900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0269, + "step": 3402000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3402100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3402200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3402300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3402400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0246, + "step": 3402500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3402600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0271, + "step": 3402700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3402800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3402900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3403000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3403100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3403200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3403300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3403400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3403500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3403600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3403700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3403800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3403900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3404000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3404100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3404200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3404300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3404400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3404500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3404600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3404700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3404800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3404900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3405000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3405100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3405200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3405300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3405400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3405500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3405600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3405700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3405800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3405900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3406000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3406100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3406200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3406300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3406400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3406500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0269, + "step": 3406600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3406700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3406800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3406900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3407000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3407100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3407200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3407300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3407400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3407500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3407600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3407700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3407800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3407900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3408000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3408100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3408200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3408300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3408400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3408500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0273, + "step": 3408600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3408700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0244, + "step": 3408800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3408900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3409000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0271, + "step": 3409100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3409200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3409300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3409400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3409500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0268, + "step": 3409600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3409700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3409800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3409900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3410000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3410100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3410200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3410300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3410400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3410500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3410600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3410700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3410800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3410900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3411000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0237, + "step": 3411100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3411200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3411300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3411400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3411500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3411600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3411700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3411800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3411900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3412000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3412100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3412200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3412300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3412400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0268, + "step": 3412500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3412600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3412700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3412800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3412900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3413000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.027, + "step": 3413100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3413200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3413300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3413400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3413500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3413600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3413700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3413800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3413900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3414000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0244, + "step": 3414100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3414200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3414300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3414400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3414500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0268, + "step": 3414600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3414700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3414800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3414900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3415000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3415100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3415200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0247, + "step": 3415300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3415400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3415500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3415600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3415700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3415800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3415900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3416000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3416100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3416200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3416300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3416400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3416500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3416600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3416700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3416800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3416900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.027, + "step": 3417000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3417100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3417200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3417300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3417400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.027, + "step": 3417500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0268, + "step": 3417600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3417700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3417800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3417900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3418000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3418100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3418200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3418300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3418400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0269, + "step": 3418500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3418600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3418700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3418800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3418900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3419000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3419100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3419200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0269, + "step": 3419300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3419400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3419500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3419600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3419700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3419800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0245, + "step": 3419900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0247, + "step": 3420000 + }, + { + "epoch": 0.0, + "eval_loss": 0.025238037109375, + "eval_runtime": 3246.5247, + "eval_samples_per_second": 346.439, + "eval_steps_per_second": 21.653, + "step": 3420000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0246, + "step": 3420100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3420200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3420300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3420400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3420500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3420600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3420700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3420800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0247, + "step": 3420900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3421000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3421100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3421200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3421300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3421400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3421500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3421600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3421700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3421800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3421900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3422000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3422100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3422200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3422300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3422400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3422500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0269, + "step": 3422600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3422700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3422800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3422900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3423000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3423100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3423200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3423300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3423400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3423500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3423600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3423700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3423800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3423900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3424000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3424100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3424200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3424300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3424400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0268, + "step": 3424500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3424600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3424700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3424800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3424900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3425000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3425100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0269, + "step": 3425200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3425300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3425400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3425500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3425600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0276, + "step": 3425700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3425800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0245, + "step": 3425900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3426000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3426100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3426200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3426300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3426400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3426500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3426600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3426700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3426800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3426900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3427000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.027, + "step": 3427100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3427200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3427300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3427400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3427500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3427600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3427700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3427800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3427900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3428000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3428100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3428200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3428300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3428400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3428500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3428600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3428700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3428800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3428900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3429000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3429100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3429200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0247, + "step": 3429300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3429400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3429500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3429600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3429700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0247, + "step": 3429800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3429900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3430000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3430100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3430200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0271, + "step": 3430300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0242, + "step": 3430400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3430500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0269, + "step": 3430600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3430700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0269, + "step": 3430800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3430900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3431000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3431100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3431200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3431300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3431400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3431500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3431600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3431700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3431800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3431900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3432000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3432100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3432200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3432300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3432400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3432500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3432600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3432700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3432800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3432900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3433000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3433100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3433200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3433300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3433400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3433500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3433600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3433700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3433800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3433900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3434000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3434100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3434200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3434300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3434400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0269, + "step": 3434500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3434600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3434700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3434800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0242, + "step": 3434900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3435000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3435100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0247, + "step": 3435200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3435300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3435400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3435500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3435600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3435700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3435800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3435900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3436000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3436100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0269, + "step": 3436200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3436300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3436400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3436500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3436600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3436700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3436800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3436900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3437000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3437100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3437200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3437300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0269, + "step": 3437400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3437500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0272, + "step": 3437600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3437700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0296, + "step": 3437800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3437900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3438000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3438100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3438200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3438300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3438400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3438500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3438600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0246, + "step": 3438700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3438800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3438900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3439000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3439100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3439200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3439300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3439400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3439500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3439600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3439700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3439800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3439900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3440000 + }, + { + "epoch": 0.0, + "eval_loss": 0.025177001953125, + "eval_runtime": 3132.3556, + "eval_samples_per_second": 359.066, + "eval_steps_per_second": 22.442, + "step": 3440000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3440100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3440200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3440300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3440400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3440500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3440600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0268, + "step": 3440700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3440800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3440900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3441000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3441100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3441200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3441300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3441400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3441500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3441600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3441700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3441800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3441900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0245, + "step": 3442000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3442100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3442200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3442300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0268, + "step": 3442400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3442500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3442600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3442700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3442800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3442900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3443000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0269, + "step": 3443100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0269, + "step": 3443200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3443300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3443400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3443500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3443600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3443700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3443800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3443900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0268, + "step": 3444000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0242, + "step": 3444100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3444200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3444300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3444400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3444500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3444600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3444700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3444800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3444900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3445000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3445100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3445200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3445300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0247, + "step": 3445400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3445500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3445600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0269, + "step": 3445700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0272, + "step": 3445800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3445900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3446000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3446100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3446200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3446300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3446400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3446500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3446600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3446700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3446800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3446900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3447000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0268, + "step": 3447100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3447200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3447300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3447400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3447500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3447600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3447700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0272, + "step": 3447800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3447900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3448000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0247, + "step": 3448100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3448200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3448300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3448400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3448500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3448600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3448700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3448800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3448900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3449000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3449100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3449200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3449300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3449400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3449500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3449600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3449700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3449800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3449900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3450000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3450100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3450200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0247, + "step": 3450300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3450400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3450500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3450600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3450700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3450800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3450900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3451000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3451100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3451200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3451300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3451400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3451500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3451600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3451700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3451800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3451900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3452000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0247, + "step": 3452100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3452200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3452300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3452400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3452500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3452600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3452700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3452800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0268, + "step": 3452900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3453000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3453100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3453200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0268, + "step": 3453300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0273, + "step": 3453400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3453500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3453600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3453700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3453800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0245, + "step": 3453900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3454000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3454100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3454200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3454300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0246, + "step": 3454400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3454500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3454600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3454700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3454800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3454900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3455000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3455100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3455200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3455300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0241, + "step": 3455400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3455500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3455600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3455700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3455800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3455900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3456000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3456100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3456200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0269, + "step": 3456300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3456400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3456500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3456600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3456700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3456800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3456900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3457000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3457100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3457200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3457300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3457400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3457500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3457600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3457700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3457800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3457900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3458000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3458100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3458200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3458300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3458400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3458500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3458600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3458700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3458800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3458900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0243, + "step": 3459000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0247, + "step": 3459100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3459200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3459300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3459400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3459500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3459600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3459700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3459800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3459900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3460000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0251617431640625, + "eval_runtime": 3022.7979, + "eval_samples_per_second": 372.08, + "eval_steps_per_second": 23.255, + "step": 3460000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3460100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3460200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3460300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3460400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3460500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3460600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3460700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3460800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3460900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3461000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3461100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3461200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0243, + "step": 3461300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3461400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3461500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3461600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3461700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3461800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3461900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3462000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3462100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3462200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3462300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3462400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3462500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3462600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0247, + "step": 3462700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3462800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3462900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0245, + "step": 3463000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3463100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3463200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3463300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0243, + "step": 3463400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3463500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3463600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3463700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3463800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0268, + "step": 3463900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3464000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3464100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3464200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3464300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0271, + "step": 3464400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3464500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3464600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3464700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3464800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3464900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3465000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0243, + "step": 3465100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3465200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3465300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3465400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3465500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3465600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3465700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3465800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3465900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3466000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3466100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3466200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3466300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0268, + "step": 3466400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3466500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3466600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3466700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3466800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3466900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3467000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3467100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3467200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3467300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0245, + "step": 3467400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3467500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3467600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3467700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0246, + "step": 3467800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0272, + "step": 3467900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3468000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3468100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3468200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3468300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0246, + "step": 3468400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3468500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3468600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3468700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3468800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3468900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3469000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3469100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3469200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3469300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3469400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3469500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3469600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3469700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3469800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3469900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0271, + "step": 3470000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3470100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0245, + "step": 3470200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3470300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3470400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3470500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3470600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3470700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0269, + "step": 3470800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3470900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3471000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3471100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3471200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3471300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3471400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3471500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3471600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3471700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0273, + "step": 3471800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3471900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3472000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3472100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3472200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3472300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3472400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0247, + "step": 3472500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3472600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0247, + "step": 3472700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3472800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3472900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3473000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3473100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0246, + "step": 3473200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3473300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3473400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3473500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3473600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3473700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0247, + "step": 3473800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3473900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3474000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3474100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3474200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3474300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0247, + "step": 3474400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3474500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3474600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3474700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3474800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3474900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3475000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3475100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3475200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3475300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3475400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3475500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3475600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3475700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3475800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3475900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3476000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3476100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3476200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0242, + "step": 3476300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3476400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3476500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3476600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3476700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3476800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3476900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3477000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3477100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3477200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3477300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3477400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0244, + "step": 3477500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3477600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3477700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3477800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3477900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3478000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3478100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3478200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3478300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3478400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3478500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.027, + "step": 3478600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3478700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3478800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3478900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3479000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3479100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3479200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3479300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3479400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3479500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0269, + "step": 3479600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3479700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3479800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3479900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3480000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0251617431640625, + "eval_runtime": 3063.2627, + "eval_samples_per_second": 367.165, + "eval_steps_per_second": 22.948, + "step": 3480000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3480100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3480200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3480300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3480400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3480500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3480600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3480700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3480800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3480900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3481000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0244, + "step": 3481100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3481200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3481300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3481400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3481500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3481600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3481700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3481800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3481900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3482000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3482100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0243, + "step": 3482200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3482300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3482400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3482500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3482600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3482700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3482800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3482900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3483000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0246, + "step": 3483100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3483200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0245, + "step": 3483300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3483400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3483500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3483600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3483700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3483800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3483900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3484000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0269, + "step": 3484100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3484200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3484300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3484400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3484500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3484600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3484700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3484800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3484900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3485000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3485100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3485200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3485300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3485400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3485500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3485600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3485700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0245, + "step": 3485800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3485900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3486000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0247, + "step": 3486100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3486200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3486300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3486400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3486500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3486600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3486700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3486800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3486900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3487000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3487100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3487200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3487300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3487400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3487500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3487600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3487700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3487800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3487900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3488000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3488100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3488200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3488300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0269, + "step": 3488400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3488500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3488600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3488700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3488800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0269, + "step": 3488900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3489000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3489100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3489200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3489300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3489400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3489500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3489600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3489700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3489800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3489900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3490000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0247, + "step": 3490100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3490200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3490300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3490400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3490500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3490600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3490700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3490800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3490900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3491000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0247, + "step": 3491100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3491200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3491300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3491400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3491500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3491600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3491700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3491800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3491900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3492000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3492100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3492200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3492300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3492400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0247, + "step": 3492500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3492600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3492700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3492800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3492900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3493000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3493100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3493200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3493300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3493400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3493500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3493600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3493700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3493800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3493900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3494000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3494100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3494200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3494300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3494400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3494500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3494600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3494700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3494800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3494900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3495000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3495100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3495200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3495300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3495400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3495500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3495600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3495700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3495800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3495900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3496000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3496100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3496200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0247, + "step": 3496300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3496400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3496500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3496600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3496700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3496800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3496900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3497000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3497100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3497200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3497300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3497400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0246, + "step": 3497500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0246, + "step": 3497600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3497700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3497800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3497900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3498000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0246, + "step": 3498100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3498200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3498300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3498400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3498500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3498600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3498700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3498800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3498900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3499000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0247, + "step": 3499100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3499200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3499300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3499400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3499500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3499600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3499700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3499800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3499900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3500000 + }, + { + "epoch": 0.0, + "eval_loss": 0.025115966796875, + "eval_runtime": 4339.9133, + "eval_samples_per_second": 259.158, + "eval_steps_per_second": 16.198, + "step": 3500000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3500100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3500200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3500300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3500400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3500500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0239, + "step": 3500600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3500700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3500800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3500900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3501000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3501100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3501200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3501300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3501400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3501500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3501600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3501700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3501800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3501900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3502000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3502100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3502200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3502300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3502400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3502500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3502600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3502700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3502800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3502900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3503000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3503100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3503200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3503300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3503400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3503500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3503600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3503700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3503800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3503900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3504000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3504100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3504200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0271, + "step": 3504300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3504400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3504500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3504600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3504700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0243, + "step": 3504800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0247, + "step": 3504900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3505000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0244, + "step": 3505100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3505200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0246, + "step": 3505300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3505400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3505500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3505600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3505700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3505800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3505900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3506000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0246, + "step": 3506100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3506200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3506300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3506400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3506500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.024, + "step": 3506600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3506700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3506800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3506900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3507000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3507100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3507200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3507300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3507400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0268, + "step": 3507500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3507600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3507700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3507800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3507900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3508000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3508100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3508200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3508300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3508400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3508500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3508600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0243, + "step": 3508700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3508800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3508900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3509000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3509100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3509200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3509300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3509400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3509500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3509600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3509700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3509800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3509900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3510000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3510100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0245, + "step": 3510200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3510300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3510400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3510500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3510600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3510700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3510800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0247, + "step": 3510900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3511000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3511100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3511200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3511300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3511400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3511500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3511600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3511700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3511800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3511900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3512000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3512100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3512200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3512300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3512400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3512500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3512600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3512700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3512800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0245, + "step": 3512900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0244, + "step": 3513000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3513100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3513200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3513300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0243, + "step": 3513400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3513500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3513600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3513700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3513800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3513900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3514000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3514100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3514200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3514300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3514400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3514500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3514600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3514700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3514800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3514900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3515000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3515100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3515200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3515300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3515400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3515500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0271, + "step": 3515600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3515700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3515800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3515900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0269, + "step": 3516000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3516100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3516200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3516300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0268, + "step": 3516400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3516500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0239, + "step": 3516600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3516700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3516800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3516900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3517000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3517100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0244, + "step": 3517200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3517300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3517400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3517500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3517600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3517700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3517800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3517900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3518000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3518100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3518200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3518300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3518400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0247, + "step": 3518500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3518600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3518700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0247, + "step": 3518800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3518900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3519000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3519100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3519200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3519300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3519400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3519500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0238, + "step": 3519600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3519700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3519800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3519900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3520000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0251007080078125, + "eval_runtime": 4332.2578, + "eval_samples_per_second": 259.616, + "eval_steps_per_second": 16.226, + "step": 3520000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3520100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0268, + "step": 3520200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3520300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3520400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3520500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3520600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3520700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3520800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3520900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3521000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3521100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3521200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3521300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3521400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3521500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3521600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3521700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3521800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3521900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3522000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3522100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3522200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3522300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3522400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3522500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3522600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3522700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3522800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3522900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3523000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3523100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0269, + "step": 3523200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3523300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3523400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3523500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3523600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3523700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3523800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3523900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3524000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3524100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0243, + "step": 3524200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3524300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0247, + "step": 3524400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3524500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3524600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3524700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3524800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3524900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3525000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3525100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3525200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3525300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0245, + "step": 3525400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0271, + "step": 3525500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3525600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3525700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0247, + "step": 3525800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3525900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3526000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3526100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3526200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3526300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3526400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3526500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3526600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3526700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3526800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3526900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3527000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3527100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3527200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3527300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3527400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3527500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3527600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3527700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3527800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0244, + "step": 3527900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3528000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3528100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0242, + "step": 3528200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3528300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3528400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3528500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3528600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3528700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3528800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0244, + "step": 3528900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3529000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3529100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3529200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3529300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3529400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0247, + "step": 3529500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3529600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3529700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3529800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3529900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3530000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3530100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0247, + "step": 3530200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3530300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3530400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3530500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3530600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3530700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3530800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3530900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3531000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0247, + "step": 3531100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3531200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3531300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3531400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0243, + "step": 3531500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3531600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3531700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3531800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3531900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3532000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3532100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.027, + "step": 3532200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3532300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3532400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3532500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3532600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3532700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0247, + "step": 3532800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3532900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3533000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3533100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3533200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3533300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3533400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3533500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3533600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3533700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3533800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3533900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3534000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3534100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3534200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3534300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3534400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3534500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0247, + "step": 3534600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3534700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3534800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3534900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3535000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3535100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3535200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3535300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3535400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3535500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3535600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3535700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3535800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3535900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3536000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.027, + "step": 3536100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3536200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3536300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3536400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0243, + "step": 3536500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3536600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3536700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3536800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3536900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3537000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3537100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3537200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3537300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3537400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3537500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3537600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3537700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3537800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3537900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3538000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3538100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3538200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3538300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3538400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3538500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3538600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3538700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3538800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3538900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3539000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0268, + "step": 3539100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0245, + "step": 3539200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3539300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3539400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3539500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3539600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3539700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3539800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3539900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3540000 + }, + { + "epoch": 0.0, + "eval_loss": 0.02508544921875, + "eval_runtime": 4211.7411, + "eval_samples_per_second": 267.045, + "eval_steps_per_second": 16.69, + "step": 3540000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0246, + "step": 3540100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3540200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.024, + "step": 3540300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3540400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0247, + "step": 3540500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3540600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3540700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3540800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3540900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3541000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3541100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3541200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3541300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0246, + "step": 3541400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3541500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3541600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3541700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3541800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3541900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3542000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3542100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0245, + "step": 3542200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3542300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3542400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3542500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3542600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3542700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3542800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3542900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0245, + "step": 3543000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0242, + "step": 3543100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3543200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0241, + "step": 3543300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3543400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3543500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3543600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3543700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3543800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0269, + "step": 3543900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3544000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0244, + "step": 3544100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3544200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3544300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3544400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0242, + "step": 3544500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3544600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3544700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3544800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3544900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3545000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0269, + "step": 3545100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3545200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0246, + "step": 3545300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3545400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3545500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3545600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3545700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3545800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3545900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3546000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3546100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0246, + "step": 3546200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3546300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3546400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3546500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3546600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3546700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3546800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3546900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3547000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3547100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3547200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3547300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0247, + "step": 3547400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3547500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0247, + "step": 3547600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0272, + "step": 3547700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3547800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3547900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3548000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3548100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3548200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3548300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3548400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3548500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3548600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0246, + "step": 3548700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3548800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3548900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3549000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3549100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0238, + "step": 3549200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3549300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3549400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3549500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3549600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3549700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0245, + "step": 3549800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3549900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3550000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3550100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3550200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3550300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3550400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3550500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3550600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3550700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0269, + "step": 3550800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3550900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0247, + "step": 3551000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3551100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3551200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3551300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3551400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0245, + "step": 3551500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3551600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3551700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3551800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.027, + "step": 3551900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3552000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3552100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3552200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3552300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3552400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3552500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3552600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3552700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3552800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3552900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3553000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3553100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3553200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3553300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0246, + "step": 3553400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3553500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3553600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3553700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3553800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3553900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3554000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3554100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0246, + "step": 3554200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3554300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3554400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3554500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3554600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3554700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3554800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3554900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3555000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3555100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3555200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3555300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3555400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3555500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3555600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3555700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3555800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3555900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3556000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3556100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3556200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3556300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3556400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3556500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3556600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3556700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3556800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3556900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3557000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3557100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3557200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3557300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3557400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3557500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3557600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0245, + "step": 3557700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3557800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3557900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3558000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0245, + "step": 3558100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3558200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3558300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3558400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0247, + "step": 3558500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3558600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0245, + "step": 3558700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3558800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3558900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3559000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3559100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3559200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3559300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3559400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3559500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0245, + "step": 3559600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3559700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3559800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0245, + "step": 3559900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3560000 + }, + { + "epoch": 0.0, + "eval_loss": 0.025054931640625, + "eval_runtime": 3881.6639, + "eval_samples_per_second": 289.753, + "eval_steps_per_second": 18.11, + "step": 3560000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3560100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3560200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3560300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3560400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3560500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3560600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3560700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3560800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3560900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3561000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3561100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3561200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0239, + "step": 3561300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3561400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3561500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3561600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3561700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3561800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3561900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3562000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3562100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0269, + "step": 3562200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3562300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3562400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3562500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3562600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3562700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3562800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3562900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3563000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3563100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3563200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3563300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3563400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3563500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3563600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3563700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3563800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3563900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3564000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0247, + "step": 3564100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3564200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3564300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3564400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3564500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0243, + "step": 3564600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3564700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3564800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3564900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3565000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3565100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3565200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3565300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0247, + "step": 3565400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0268, + "step": 3565500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3565600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3565700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3565800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3565900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0247, + "step": 3566000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3566100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3566200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3566300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0247, + "step": 3566400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3566500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3566600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3566700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3566800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3566900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3567000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3567100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3567200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3567300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3567400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0268, + "step": 3567500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3567600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3567700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3567800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3567900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3568000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3568100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0242, + "step": 3568200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3568300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3568400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3568500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3568600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0246, + "step": 3568700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3568800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3568900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3569000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3569100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3569200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3569300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3569400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3569500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3569600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3569700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3569800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3569900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3570000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3570100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0269, + "step": 3570200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3570300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3570400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3570500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0246, + "step": 3570600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3570700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3570800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3570900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3571000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3571100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3571200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0246, + "step": 3571300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3571400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3571500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3571600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3571700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3571800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3571900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0272, + "step": 3572000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3572100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3572200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3572300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3572400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3572500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3572600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3572700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3572800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3572900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3573000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3573100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3573200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0245, + "step": 3573300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3573400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0247, + "step": 3573500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3573600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3573700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3573800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3573900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3574000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3574100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0246, + "step": 3574200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0246, + "step": 3574300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3574400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3574500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3574600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0247, + "step": 3574700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3574800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3574900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3575000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0271, + "step": 3575100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3575200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3575300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3575400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3575500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3575600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3575700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0243, + "step": 3575800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3575900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3576000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3576100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3576200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3576300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3576400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3576500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3576600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3576700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3576800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3576900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3577000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3577100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0246, + "step": 3577200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3577300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0245, + "step": 3577400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3577500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3577600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3577700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3577800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0246, + "step": 3577900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3578000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3578100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3578200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0245, + "step": 3578300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3578400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3578500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3578600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3578700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3578800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3578900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3579000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3579100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3579200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3579300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3579400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3579500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3579600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3579700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3579800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3579900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3580000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0250244140625, + "eval_runtime": 3782.3415, + "eval_samples_per_second": 297.362, + "eval_steps_per_second": 18.585, + "step": 3580000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3580100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3580200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3580300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0245, + "step": 3580400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3580500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3580600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3580700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0274, + "step": 3580800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3580900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3581000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3581100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3581200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3581300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3581400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.024, + "step": 3581500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3581600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3581700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3581800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3581900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3582000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0246, + "step": 3582100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3582200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3582300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3582400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0243, + "step": 3582500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0246, + "step": 3582600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3582700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3582800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3582900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3583000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3583100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3583200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3583300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0245, + "step": 3583400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3583500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3583600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3583700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0247, + "step": 3583800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3583900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3584000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3584100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3584200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3584300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3584400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3584500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3584600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3584700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3584800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0242, + "step": 3584900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3585000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3585100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3585200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3585300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3585400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3585500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0268, + "step": 3585600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3585700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0232, + "step": 3585800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3585900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3586000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3586100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0247, + "step": 3586200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3586300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3586400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3586500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.027, + "step": 3586600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3586700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0242, + "step": 3586800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3586900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3587000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0275, + "step": 3587100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0284, + "step": 3587200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3587300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0245, + "step": 3587400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3587500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3587600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3587700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3587800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3587900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3588000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3588100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3588200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3588300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3588400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3588500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3588600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3588700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3588800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3588900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3589000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3589100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3589200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3589300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0244, + "step": 3589400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3589500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3589600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3589700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3589800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0244, + "step": 3589900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3590000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3590100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3590200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3590300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3590400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3590500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3590600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3590700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3590800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3590900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3591000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3591100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3591200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0247, + "step": 3591300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3591400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3591500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3591600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3591700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3591800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3591900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3592000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3592100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3592200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3592300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0242, + "step": 3592400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3592500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3592600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0244, + "step": 3592700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3592800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3592900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0246, + "step": 3593000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3593100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3593200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3593300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3593400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3593500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3593600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3593700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3593800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3593900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3594000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3594100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3594200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0247, + "step": 3594300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3594400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3594500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0271, + "step": 3594600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3594700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3594800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0245, + "step": 3594900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3595000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0237, + "step": 3595100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3595200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3595300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3595400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3595500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0247, + "step": 3595600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3595700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3595800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3595900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3596000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.027, + "step": 3596100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3596200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3596300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3596400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3596500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3596600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3596700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3596800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3596900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3597000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3597100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3597200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3597300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3597400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3597500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3597600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3597700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3597800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3597900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3598000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3598100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3598200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3598300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3598400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0247, + "step": 3598500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3598600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0241, + "step": 3598700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3598800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3598900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3599000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3599100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3599200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3599300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3599400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3599500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0247, + "step": 3599600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3599700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3599800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3599900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0247, + "step": 3600000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0250396728515625, + "eval_runtime": 3978.3043, + "eval_samples_per_second": 282.714, + "eval_steps_per_second": 17.67, + "step": 3600000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0246, + "step": 3600100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3600200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3600300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3600400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3600500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3600600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0247, + "step": 3600700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3600800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0246, + "step": 3600900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3601000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3601100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0246, + "step": 3601200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3601300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3601400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3601500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3601600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3601700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3601800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3601900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0242, + "step": 3602000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3602100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3602200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3602300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3602400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3602500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3602600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3602700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3602800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3602900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3603000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0245, + "step": 3603100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3603200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3603300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3603400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3603500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3603600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3603700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3603800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0246, + "step": 3603900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3604000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3604100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3604200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3604300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3604400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0243, + "step": 3604500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3604600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3604700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3604800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3604900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3605000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3605100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3605200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3605300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3605400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3605500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0246, + "step": 3605600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3605700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0245, + "step": 3605800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3605900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3606000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0245, + "step": 3606100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3606200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3606300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3606400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3606500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3606600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3606700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3606800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3606900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0243, + "step": 3607000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3607100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3607200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3607300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3607400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3607500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3607600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3607700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3607800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3607900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3608000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0246, + "step": 3608100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3608200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3608300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0247, + "step": 3608400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3608500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3608600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3608700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3608800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3608900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3609000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3609100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3609200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3609300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3609400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3609500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3609600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3609700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3609800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3609900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3610000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3610100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0268, + "step": 3610200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3610300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3610400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3610500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3610600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3610700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3610800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3610900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3611000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3611100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3611200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3611300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3611400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3611500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3611600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3611700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3611800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3611900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0243, + "step": 3612000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3612100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3612200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3612300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3612400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3612500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3612600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3612700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0269, + "step": 3612800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3612900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3613000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0244, + "step": 3613100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3613200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3613300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3613400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3613500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3613600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3613700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3613800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3613900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3614000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0245, + "step": 3614100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3614200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3614300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3614400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3614500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3614600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3614700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3614800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3614900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3615000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3615100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3615200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3615300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3615400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0247, + "step": 3615500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3615600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3615700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3615800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3615900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3616000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3616100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3616200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3616300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0244, + "step": 3616400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0245, + "step": 3616500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3616600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0245, + "step": 3616700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3616800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3616900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3617000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3617100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3617200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3617300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3617400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0241, + "step": 3617500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3617600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3617700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3617800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3617900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3618000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3618100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3618200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3618300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3618400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3618500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3618600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3618700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3618800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3618900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3619000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3619100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3619200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3619300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3619400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3619500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3619600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3619700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3619800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3619900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0242, + "step": 3620000 + }, + { + "epoch": 0.0, + "eval_loss": 0.025054931640625, + "eval_runtime": 4907.8924, + "eval_samples_per_second": 229.166, + "eval_steps_per_second": 14.323, + "step": 3620000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0243, + "step": 3620100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3620200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3620300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3620400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0245, + "step": 3620500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3620600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3620700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3620800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3620900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3621000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3621100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3621200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0247, + "step": 3621300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3621400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3621500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3621600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3621700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3621800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3621900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3622000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0268, + "step": 3622100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3622200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3622300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3622400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0243, + "step": 3622500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3622600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3622700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3622800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3622900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3623000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3623100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3623200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3623300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0247, + "step": 3623400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3623500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3623600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0247, + "step": 3623700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3623800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3623900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3624000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3624100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3624200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3624300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0247, + "step": 3624400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3624500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3624600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3624700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0247, + "step": 3624800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.027, + "step": 3624900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0247, + "step": 3625000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3625100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3625200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3625300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3625400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3625500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3625600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3625700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3625800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3625900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3626000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3626100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3626200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3626300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0246, + "step": 3626400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3626500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0242, + "step": 3626600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3626700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3626800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3626900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3627000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3627100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0243, + "step": 3627200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3627300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3627400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3627500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3627600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0246, + "step": 3627700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3627800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3627900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3628000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3628100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0245, + "step": 3628200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3628300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3628400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3628500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0262, + "step": 3628600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3628700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3628800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0243, + "step": 3628900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3629000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0239, + "step": 3629100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3629200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3629300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0245, + "step": 3629400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0245, + "step": 3629500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3629600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3629700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3629800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0246, + "step": 3629900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3630000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0264, + "step": 3630100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0259, + "step": 3630200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3630300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0247, + "step": 3630400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3630500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3630600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3630700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3630800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3630900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3631000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3631100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0247, + "step": 3631200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3631300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3631400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3631500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3631600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3631700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0263, + "step": 3631800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3631900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0245, + "step": 3632000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3632100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0266, + "step": 3632200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3632300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3632400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3632500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3632600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3632700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3632800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3632900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3633000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3633100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3633200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3633300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0243, + "step": 3633400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3633500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3633600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3633700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3633800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3633900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3634000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0245, + "step": 3634100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3634200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3634300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3634400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3634500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3634600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3634700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3634800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3634900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3635000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3635100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0244, + "step": 3635200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3635300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3635400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0253, + "step": 3635500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3635600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3635700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3635800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3635900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3636000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3636100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0246, + "step": 3636200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0247, + "step": 3636300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3636400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3636500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0246, + "step": 3636600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3636700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3636800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3636900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3637000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3637100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0246, + "step": 3637200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0258, + "step": 3637300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.026, + "step": 3637400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3637500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0265, + "step": 3637600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0252, + "step": 3637700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0243, + "step": 3637800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0247, + "step": 3637900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3638000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0261, + "step": 3638100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3638200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0256, + "step": 3638300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3638400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3638500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0248, + "step": 3638600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0254, + "step": 3638700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.025, + "step": 3638800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3638900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0255, + "step": 3639000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3639100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0243, + "step": 3639200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0257, + "step": 3639300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0246, + "step": 3639400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0245, + "step": 3639500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3639600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0251, + "step": 3639700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0241, + "step": 3639800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0249, + "step": 3639900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 0.0267, + "step": 3640000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0250091552734375, + "eval_runtime": 5536.8339, + "eval_samples_per_second": 203.135, + "eval_steps_per_second": 12.696, + "step": 3640000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3640100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3640200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0243, + "step": 3640300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 3640400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0259, + "step": 3640500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3640600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0241, + "step": 3640700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0264, + "step": 3640800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3640900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3641000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0243, + "step": 3641100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3641200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3641300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3641400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3641500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0269, + "step": 3641600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3641700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3641800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3641900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3642000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3642100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3642200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3642300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0258, + "step": 3642400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3642500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0258, + "step": 3642600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0275, + "step": 3642700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3642800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3642900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3643000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0263, + "step": 3643100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3643200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3643300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 3643400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0258, + "step": 3643500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3643600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0258, + "step": 3643700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3643800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0263, + "step": 3643900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 3644000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3644100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0267, + "step": 3644200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0243, + "step": 3644300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3644400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3644500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3644600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3644700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3644800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3644900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3645000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3645100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0259, + "step": 3645200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3645300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3645400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3645500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 3645600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3645700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0261, + "step": 3645800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 3645900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3646000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.026, + "step": 3646100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3646200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3646300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3646400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3646500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3646600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3646700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3646800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3646900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3647000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3647100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0244, + "step": 3647200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3647300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3647400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0264, + "step": 3647500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3647600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3647700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0263, + "step": 3647800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3647900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3648000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3648100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3648200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0258, + "step": 3648300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0259, + "step": 3648400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3648500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3648600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 3648700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3648800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3648900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3649000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3649100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0261, + "step": 3649200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3649300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3649400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0269, + "step": 3649500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 3649600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3649700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3649800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3649900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3650000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3650100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0258, + "step": 3650200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3650300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3650400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3650500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0243, + "step": 3650600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0259, + "step": 3650700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3650800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3650900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3651000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3651100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3651200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3651300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3651400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3651500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3651600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 3651700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3651800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3651900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.026, + "step": 3652000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3652100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3652200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3652300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 3652400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3652500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3652600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3652700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3652800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3652900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3653000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3653100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 3653200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3653300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3653400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3653500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3653600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0243, + "step": 3653700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3653800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3653900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3654000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3654100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0261, + "step": 3654200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0244, + "step": 3654300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0243, + "step": 3654400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3654500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3654600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0263, + "step": 3654700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.026, + "step": 3654800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3654900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0266, + "step": 3655000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3655100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0264, + "step": 3655200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3655300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3655400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3655500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3655600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3655700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3655800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.027, + "step": 3655900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3656000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0259, + "step": 3656100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3656200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0242, + "step": 3656300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3656400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3656500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3656600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0259, + "step": 3656700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0259, + "step": 3656800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3656900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3657000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0261, + "step": 3657100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3657200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3657300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3657400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3657500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3657600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3657700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3657800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3657900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3658000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0242, + "step": 3658100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3658200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3658300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3658400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0236, + "step": 3658500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3658600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.026, + "step": 3658700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3658800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3658900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0261, + "step": 3659000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3659100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0259, + "step": 3659200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0243, + "step": 3659300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3659400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0259, + "step": 3659500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3659600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3659700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3659800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3659900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3660000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0248870849609375, + "eval_runtime": 140.1439, + "eval_samples_per_second": 356.776, + "eval_steps_per_second": 22.299, + "step": 3660000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3660100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3660200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3660300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 3660400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 3660500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3660600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3660700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0243, + "step": 3660800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3660900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3661000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0259, + "step": 3661100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3661200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 3661300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3661400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 3661500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0243, + "step": 3661600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3661700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0242, + "step": 3661800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0258, + "step": 3661900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3662000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0264, + "step": 3662100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3662200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 3662300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3662400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3662500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3662600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3662700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3662800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3662900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0242, + "step": 3663000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0269, + "step": 3663100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3663200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3663300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.026, + "step": 3663400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3663500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3663600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3663700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3663800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0258, + "step": 3663900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3664000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3664100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3664200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0259, + "step": 3664300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3664400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.026, + "step": 3664500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3664600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3664700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0244, + "step": 3664800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0266, + "step": 3664900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3665000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3665100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3665200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0267, + "step": 3665300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3665400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3665500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3665600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3665700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 3665800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3665900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3666000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 3666100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0239, + "step": 3666200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0243, + "step": 3666300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3666400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0263, + "step": 3666500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0235, + "step": 3666600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0263, + "step": 3666700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3666800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0259, + "step": 3666900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0258, + "step": 3667000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3667100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3667200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 3667300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3667400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0243, + "step": 3667500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0263, + "step": 3667600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0258, + "step": 3667700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3667800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3667900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0261, + "step": 3668000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3668100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0259, + "step": 3668200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3668300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.026, + "step": 3668400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3668500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3668600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3668700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3668800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.026, + "step": 3668900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3669000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0244, + "step": 3669100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3669200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0261, + "step": 3669300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3669400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3669500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0258, + "step": 3669600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3669700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3669800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 3669900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0271, + "step": 3670000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3670100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3670200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0277, + "step": 3670300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0258, + "step": 3670400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3670500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3670600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0263, + "step": 3670700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0264, + "step": 3670800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0241, + "step": 3670900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3671000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3671100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3671200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3671300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3671400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3671500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3671600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3671700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3671800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3671900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 3672000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3672100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0241, + "step": 3672200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3672300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0264, + "step": 3672400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0258, + "step": 3672500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3672600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3672700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 3672800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0259, + "step": 3672900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0239, + "step": 3673000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3673100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 3673200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3673300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0258, + "step": 3673400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0258, + "step": 3673500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3673600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0259, + "step": 3673700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3673800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0236, + "step": 3673900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3674000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.026, + "step": 3674100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3674200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0244, + "step": 3674300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3674400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0265, + "step": 3674500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3674600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0266, + "step": 3674700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3674800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3674900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3675000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3675100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3675200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3675300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3675400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3675500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.027, + "step": 3675600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3675700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0263, + "step": 3675800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3675900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3676000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3676100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3676200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3676300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3676400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3676500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 3676600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3676700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 3676800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3676900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3677000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0242, + "step": 3677100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0258, + "step": 3677200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3677300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3677400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3677500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3677600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3677700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3677800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3677900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3678000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3678100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3678200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3678300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3678400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0261, + "step": 3678500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3678600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3678700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3678800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3678900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3679000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3679100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3679200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0258, + "step": 3679300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3679400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 3679500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0259, + "step": 3679600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 3679700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3679800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3679900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3680000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0248565673828125, + "eval_runtime": 139.6097, + "eval_samples_per_second": 358.141, + "eval_steps_per_second": 22.384, + "step": 3680000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3680100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3680200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3680300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3680400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3680500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3680600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3680700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3680800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3680900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0243, + "step": 3681000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3681100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3681200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 3681300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.026, + "step": 3681400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3681500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3681600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3681700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0264, + "step": 3681800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.026, + "step": 3681900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0244, + "step": 3682000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 3682100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3682200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.024, + "step": 3682300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3682400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3682500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3682600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0266, + "step": 3682700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 3682800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0244, + "step": 3682900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3683000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3683100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.026, + "step": 3683200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3683300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3683400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3683500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3683600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 3683700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3683800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3683900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3684000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0242, + "step": 3684100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0243, + "step": 3684200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3684300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3684400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3684500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3684600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3684700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 3684800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0258, + "step": 3684900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0243, + "step": 3685000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3685100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.024, + "step": 3685200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0268, + "step": 3685300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3685400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3685500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3685600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0264, + "step": 3685700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3685800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.026, + "step": 3685900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3686000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3686100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 3686200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0258, + "step": 3686300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0258, + "step": 3686400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 3686500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3686600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3686700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3686800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3686900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3687000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3687100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3687200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3687300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0258, + "step": 3687400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3687500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3687600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0258, + "step": 3687700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3687800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3687900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3688000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3688100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3688200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0258, + "step": 3688300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0263, + "step": 3688400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3688500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 3688600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 3688700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0241, + "step": 3688800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3688900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0261, + "step": 3689000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 3689100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3689200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0263, + "step": 3689300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3689400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 3689500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3689600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3689700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3689800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3689900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3690000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3690100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3690200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.027, + "step": 3690300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 3690400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3690500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3690600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3690700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0264, + "step": 3690800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3690900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0242, + "step": 3691000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3691100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3691200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3691300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3691400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3691500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3691600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3691700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0258, + "step": 3691800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0266, + "step": 3691900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3692000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3692100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3692200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 3692300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3692400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3692500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0258, + "step": 3692600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3692700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3692800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0266, + "step": 3692900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3693000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3693100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3693200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3693300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 3693400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0263, + "step": 3693500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3693600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 3693700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0265, + "step": 3693800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3693900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3694000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3694100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0243, + "step": 3694200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3694300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3694400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0258, + "step": 3694500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3694600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3694700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0241, + "step": 3694800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3694900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3695000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3695100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3695200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3695300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3695400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3695500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3695600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 3695700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3695800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3695900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3696000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3696100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3696200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3696300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3696400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 3696500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 3696600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3696700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0241, + "step": 3696800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 3696900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0243, + "step": 3697000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0243, + "step": 3697100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3697200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0258, + "step": 3697300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3697400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3697500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0258, + "step": 3697600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3697700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3697800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3697900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3698000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 3698100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0261, + "step": 3698200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3698300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3698400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.026, + "step": 3698500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3698600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0241, + "step": 3698700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3698800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3698900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3699000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3699100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3699200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3699300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3699400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.024, + "step": 3699500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0268, + "step": 3699600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 3699700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.027, + "step": 3699800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3699900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 3700000 + }, + { + "epoch": 0.0, + "eval_loss": 0.02484130859375, + "eval_runtime": 131.8971, + "eval_samples_per_second": 379.083, + "eval_steps_per_second": 23.693, + "step": 3700000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3700100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3700200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3700300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3700400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3700500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3700600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 3700700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3700800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3700900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0258, + "step": 3701000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3701100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3701200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3701300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3701400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3701500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3701600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3701700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3701800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3701900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.024, + "step": 3702000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3702100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3702200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3702300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3702400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.026, + "step": 3702500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3702600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0266, + "step": 3702700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0244, + "step": 3702800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3702900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3703000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3703100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0258, + "step": 3703200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0262, + "step": 3703300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3703400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3703500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3703600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3703700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0261, + "step": 3703800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0258, + "step": 3703900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0239, + "step": 3704000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0244, + "step": 3704100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3704200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3704300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3704400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3704500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3704600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3704700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0269, + "step": 3704800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3704900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0261, + "step": 3705000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3705100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0241, + "step": 3705200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0265, + "step": 3705300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3705400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0244, + "step": 3705500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3705600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3705700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3705800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3705900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 3706000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3706100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3706200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0244, + "step": 3706300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 3706400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3706500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0244, + "step": 3706600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3706700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0242, + "step": 3706800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.024, + "step": 3706900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3707000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0241, + "step": 3707100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3707200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3707300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3707400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3707500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.024, + "step": 3707600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0262, + "step": 3707700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3707800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3707900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3708000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3708100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0266, + "step": 3708200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3708300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3708400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3708500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3708600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0243, + "step": 3708700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3708800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3708900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 3709000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3709100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3709200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3709300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0263, + "step": 3709400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3709500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3709600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0265, + "step": 3709700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3709800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0244, + "step": 3709900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0258, + "step": 3710000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3710100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3710200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3710300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3710400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0259, + "step": 3710500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3710600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0241, + "step": 3710700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0259, + "step": 3710800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3710900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3711000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3711100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3711200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3711300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3711400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0258, + "step": 3711500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3711600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 3711700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3711800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0259, + "step": 3711900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3712000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0261, + "step": 3712100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3712200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3712300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3712400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 3712500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0244, + "step": 3712600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3712700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3712800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3712900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0241, + "step": 3713000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3713100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3713200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3713300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3713400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3713500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3713600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3713700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0258, + "step": 3713800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 3713900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3714000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3714100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3714200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0244, + "step": 3714300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 3714400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3714500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 3714600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3714700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3714800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3714900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3715000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0259, + "step": 3715100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0261, + "step": 3715200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3715300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 3715400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3715500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0242, + "step": 3715600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3715700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0265, + "step": 3715800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3715900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 3716000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3716100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3716200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3716300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3716400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3716500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3716600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 3716700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3716800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0243, + "step": 3716900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3717000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3717100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.026, + "step": 3717200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.026, + "step": 3717300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3717400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3717500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0244, + "step": 3717600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0259, + "step": 3717700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3717800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3717900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3718000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3718100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3718200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.026, + "step": 3718300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3718400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0241, + "step": 3718500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0258, + "step": 3718600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0259, + "step": 3718700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3718800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0242, + "step": 3718900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0267, + "step": 3719000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3719100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0262, + "step": 3719200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3719300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3719400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3719500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3719600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0259, + "step": 3719700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3719800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.026, + "step": 3719900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 3720000 + }, + { + "epoch": 0.0, + "eval_loss": 0.02484130859375, + "eval_runtime": 132.2399, + "eval_samples_per_second": 378.101, + "eval_steps_per_second": 23.631, + "step": 3720000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.027, + "step": 3720100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0258, + "step": 3720200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3720300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3720400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0259, + "step": 3720500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3720600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3720700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.026, + "step": 3720800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.024, + "step": 3720900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3721000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 3721100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3721200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3721300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3721400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3721500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3721600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3721700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3721800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3721900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3722000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3722100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3722200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3722300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0244, + "step": 3722400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0259, + "step": 3722500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0258, + "step": 3722600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3722700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0259, + "step": 3722800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 3722900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3723000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3723100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3723200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3723300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3723400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3723500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3723600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3723700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0243, + "step": 3723800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3723900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3724000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0267, + "step": 3724100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3724200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3724300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3724400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3724500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0229, + "step": 3724600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0261, + "step": 3724700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3724800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3724900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 3725000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3725100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0258, + "step": 3725200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0242, + "step": 3725300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 3725400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3725500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3725600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3725700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0235, + "step": 3725800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0259, + "step": 3725900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0263, + "step": 3726000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0244, + "step": 3726100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3726200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3726300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3726400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3726500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3726600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0241, + "step": 3726700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3726800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 3726900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3727000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3727100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 3727200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 3727300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0263, + "step": 3727400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3727500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3727600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 3727700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 3727800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0243, + "step": 3727900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3728000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.026, + "step": 3728100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3728200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3728300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0244, + "step": 3728400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3728500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0261, + "step": 3728600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3728700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3728800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0258, + "step": 3728900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 3729000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0242, + "step": 3729100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0261, + "step": 3729200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3729300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3729400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3729500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0262, + "step": 3729600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3729700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3729800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3729900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3730000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0243, + "step": 3730100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3730200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3730300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3730400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3730500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3730600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0239, + "step": 3730700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0244, + "step": 3730800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3730900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3731000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0238, + "step": 3731100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0241, + "step": 3731200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3731300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3731400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0258, + "step": 3731500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0258, + "step": 3731600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3731700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 3731800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3731900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 3732000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3732100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3732200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3732300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0258, + "step": 3732400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3732500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3732600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 3732700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3732800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3732900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0265, + "step": 3733000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0241, + "step": 3733100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 3733200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3733300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0261, + "step": 3733400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0262, + "step": 3733500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 3733600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 3733700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3733800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0241, + "step": 3733900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3734000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3734100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0244, + "step": 3734200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3734300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 3734400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3734500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3734600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3734700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3734800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3734900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3735000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3735100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0241, + "step": 3735200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3735300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3735400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 3735500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3735600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3735700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3735800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 3735900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3736000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3736100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3736200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3736300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3736400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3736500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3736600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0244, + "step": 3736700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3736800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3736900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0263, + "step": 3737000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3737100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3737200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0259, + "step": 3737300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3737400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3737500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0264, + "step": 3737600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0241, + "step": 3737700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3737800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3737900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3738000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3738100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0261, + "step": 3738200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3738300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.026, + "step": 3738400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3738500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3738600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3738700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3738800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0264, + "step": 3738900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3739000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3739100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3739200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3739300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3739400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0263, + "step": 3739500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 3739600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 3739700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.026, + "step": 3739800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3739900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0241, + "step": 3740000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0247955322265625, + "eval_runtime": 134.092, + "eval_samples_per_second": 372.878, + "eval_steps_per_second": 23.305, + "step": 3740000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3740100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3740200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0259, + "step": 3740300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3740400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3740500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3740600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0242, + "step": 3740700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0267, + "step": 3740800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3740900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3741000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0243, + "step": 3741100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0237, + "step": 3741200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0263, + "step": 3741300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0242, + "step": 3741400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3741500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3741600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3741700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3741800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3741900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3742000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3742100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.026, + "step": 3742200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3742300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3742400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3742500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3742600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0244, + "step": 3742700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3742800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3742900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3743000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3743100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3743200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3743300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3743400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 3743500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0259, + "step": 3743600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 3743700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3743800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3743900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3744000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3744100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0239, + "step": 3744200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3744300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 3744400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3744500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3744600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3744700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3744800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3744900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 3745000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3745100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0239, + "step": 3745200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3745300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3745400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3745500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3745600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3745700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3745800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0244, + "step": 3745900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.026, + "step": 3746000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0238, + "step": 3746100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3746200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3746300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3746400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3746500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0258, + "step": 3746600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0242, + "step": 3746700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0242, + "step": 3746800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 3746900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3747000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3747100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3747200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3747300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 3747400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3747500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3747600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3747700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3747800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3747900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0261, + "step": 3748000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3748100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3748200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3748300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3748400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3748500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3748600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3748700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.024, + "step": 3748800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3748900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3749000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3749100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0259, + "step": 3749200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3749300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0243, + "step": 3749400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3749500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3749600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3749700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3749800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3749900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3750000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3750100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3750200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0241, + "step": 3750300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0258, + "step": 3750400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3750500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3750600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3750700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3750800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3750900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3751000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3751100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.024, + "step": 3751200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3751300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3751400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0263, + "step": 3751500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3751600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3751700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3751800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3751900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0267, + "step": 3752000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3752100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3752200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3752300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3752400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0238, + "step": 3752500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3752600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3752700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3752800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3752900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0261, + "step": 3753000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3753100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3753200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3753300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 3753400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3753500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3753600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3753700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0243, + "step": 3753800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0244, + "step": 3753900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3754000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0244, + "step": 3754100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3754200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3754300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0242, + "step": 3754400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0239, + "step": 3754500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3754600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3754700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3754800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3754900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3755000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3755100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3755200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3755300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3755400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3755500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0244, + "step": 3755600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3755700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.026, + "step": 3755800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3755900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3756000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3756100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 3756200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0261, + "step": 3756300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3756400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3756500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3756600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3756700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3756800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0242, + "step": 3756900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3757000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0244, + "step": 3757100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3757200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3757300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0241, + "step": 3757400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3757500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3757600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3757700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3757800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3757900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0267, + "step": 3758000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3758100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3758200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3758300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.026, + "step": 3758400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3758500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 3758600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3758700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0266, + "step": 3758800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3758900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3759000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3759100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0258, + "step": 3759200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3759300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3759400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3759500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3759600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0258, + "step": 3759700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3759800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0261, + "step": 3759900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.024, + "step": 3760000 + }, + { + "epoch": 0.0, + "eval_loss": 0.024810791015625, + "eval_runtime": 133.8117, + "eval_samples_per_second": 373.66, + "eval_steps_per_second": 23.354, + "step": 3760000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3760100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.026, + "step": 3760200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3760300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3760400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3760500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3760600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3760700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 3760800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 3760900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3761000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0239, + "step": 3761100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0262, + "step": 3761200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3761300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3761400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3761500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0258, + "step": 3761600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0268, + "step": 3761700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3761800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 3761900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3762000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0259, + "step": 3762100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0237, + "step": 3762200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 3762300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3762400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3762500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0261, + "step": 3762600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3762700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3762800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0243, + "step": 3762900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0239, + "step": 3763000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3763100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0265, + "step": 3763200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3763300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3763400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0262, + "step": 3763500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 3763600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3763700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0259, + "step": 3763800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3763900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0244, + "step": 3764000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3764100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0244, + "step": 3764200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3764300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 3764400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3764500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0244, + "step": 3764600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3764700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0237, + "step": 3764800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3764900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3765000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3765100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0258, + "step": 3765200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3765300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3765400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3765500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3765600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0259, + "step": 3765700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3765800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0268, + "step": 3765900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3766000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3766100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3766200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3766300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3766400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.024, + "step": 3766500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0244, + "step": 3766600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3766700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3766800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3766900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3767000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0243, + "step": 3767100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3767200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3767300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0238, + "step": 3767400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0258, + "step": 3767500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0262, + "step": 3767600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0259, + "step": 3767700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3767800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.026, + "step": 3767900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0261, + "step": 3768000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3768100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3768200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3768300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 3768400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3768500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0262, + "step": 3768600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3768700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.024, + "step": 3768800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3768900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.024, + "step": 3769000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 3769100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3769200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 3769300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3769400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0258, + "step": 3769500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3769600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3769700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3769800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3769900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3770000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3770100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3770200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3770300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 3770400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3770500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3770600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3770700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0262, + "step": 3770800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 3770900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0241, + "step": 3771000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3771100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3771200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.024, + "step": 3771300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3771400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0265, + "step": 3771500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0259, + "step": 3771600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3771700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3771800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0265, + "step": 3771900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0244, + "step": 3772000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3772100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.027, + "step": 3772200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0259, + "step": 3772300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3772400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3772500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 3772600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3772700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.026, + "step": 3772800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3772900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3773000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3773100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3773200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3773300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3773400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0261, + "step": 3773500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3773600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3773700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3773800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3773900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3774000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3774100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3774200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 3774300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3774400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3774500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3774600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3774700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0259, + "step": 3774800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0271, + "step": 3774900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0239, + "step": 3775000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3775100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3775200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3775300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3775400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3775500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3775600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3775700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 3775800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0258, + "step": 3775900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0243, + "step": 3776000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3776100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0243, + "step": 3776200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3776300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0259, + "step": 3776400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3776500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3776600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3776700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.024, + "step": 3776800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3776900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3777000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3777100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.026, + "step": 3777200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3777300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3777400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3777500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0258, + "step": 3777600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3777700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3777800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3777900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3778000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3778100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3778200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3778300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0259, + "step": 3778400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3778500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 3778600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 3778700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0259, + "step": 3778800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3778900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3779000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3779100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0259, + "step": 3779200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3779300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3779400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3779500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3779600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3779700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0258, + "step": 3779800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0244, + "step": 3779900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0242, + "step": 3780000 + }, + { + "epoch": 0.0, + "eval_loss": 0.024810791015625, + "eval_runtime": 129.0238, + "eval_samples_per_second": 387.525, + "eval_steps_per_second": 24.22, + "step": 3780000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0259, + "step": 3780100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3780200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0243, + "step": 3780300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0258, + "step": 3780400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3780500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3780600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3780700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3780800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3780900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3781000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0262, + "step": 3781100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0266, + "step": 3781200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3781300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0241, + "step": 3781400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3781500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3781600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3781700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3781800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 3781900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3782000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0262, + "step": 3782100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3782200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3782300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 3782400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3782500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3782600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3782700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0243, + "step": 3782800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0273, + "step": 3782900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3783000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3783100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3783200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0259, + "step": 3783300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3783400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3783500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3783600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0243, + "step": 3783700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0244, + "step": 3783800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0244, + "step": 3783900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3784000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 3784100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3784200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0242, + "step": 3784300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3784400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3784500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0259, + "step": 3784600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0241, + "step": 3784700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0243, + "step": 3784800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0243, + "step": 3784900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 3785000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3785100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3785200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3785300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3785400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0244, + "step": 3785500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3785600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0266, + "step": 3785700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0261, + "step": 3785800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3785900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3786000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0241, + "step": 3786100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3786200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3786300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0259, + "step": 3786400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0243, + "step": 3786500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3786600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3786700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0264, + "step": 3786800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3786900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3787000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3787100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3787200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3787300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0262, + "step": 3787400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3787500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3787600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3787700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0244, + "step": 3787800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3787900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3788000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0258, + "step": 3788100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0237, + "step": 3788200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3788300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3788400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0258, + "step": 3788500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3788600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3788700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3788800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 3788900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3789000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0241, + "step": 3789100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 3789200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0262, + "step": 3789300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0238, + "step": 3789400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3789500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3789600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3789700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3789800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3789900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3790000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3790100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 3790200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0259, + "step": 3790300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0242, + "step": 3790400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0242, + "step": 3790500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 3790600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3790700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0259, + "step": 3790800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3790900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3791000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0241, + "step": 3791100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3791200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0244, + "step": 3791300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3791400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3791500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0242, + "step": 3791600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3791700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3791800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3791900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3792000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3792100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3792200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3792300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0262, + "step": 3792400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0258, + "step": 3792500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3792600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3792700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3792800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3792900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 3793000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3793100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3793200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0261, + "step": 3793300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3793400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3793500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0243, + "step": 3793600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 3793700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3793800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.026, + "step": 3793900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.024, + "step": 3794000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3794100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3794200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3794300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3794400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3794500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 3794600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3794700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3794800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3794900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0242, + "step": 3795000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 3795100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3795200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3795300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3795400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3795500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0244, + "step": 3795600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3795700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3795800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3795900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3796000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3796100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0244, + "step": 3796200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3796300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3796400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3796500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0258, + "step": 3796600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 3796700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3796800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3796900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3797000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3797100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0262, + "step": 3797200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3797300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0244, + "step": 3797400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3797500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3797600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3797700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.024, + "step": 3797800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 3797900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.026, + "step": 3798000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0241, + "step": 3798100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0244, + "step": 3798200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0259, + "step": 3798300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3798400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0262, + "step": 3798500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3798600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3798700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0243, + "step": 3798800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0259, + "step": 3798900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 3799000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0239, + "step": 3799100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3799200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3799300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.026, + "step": 3799400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3799500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 3799600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 3799700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0243, + "step": 3799800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3799900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3800000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0247955322265625, + "eval_runtime": 133.8194, + "eval_samples_per_second": 373.638, + "eval_steps_per_second": 23.352, + "step": 3800000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3800100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3800200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3800300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3800400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 3800500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3800600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3800700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3800800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0259, + "step": 3800900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3801000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0261, + "step": 3801100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0258, + "step": 3801200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3801300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3801400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0243, + "step": 3801500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3801600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0244, + "step": 3801700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0244, + "step": 3801800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3801900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0259, + "step": 3802000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3802100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 3802200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3802300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0243, + "step": 3802400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3802500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 3802600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3802700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3802800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0259, + "step": 3802900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3803000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3803100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3803200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 3803300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0258, + "step": 3803400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3803500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0267, + "step": 3803600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3803700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.024, + "step": 3803800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3803900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0259, + "step": 3804000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3804100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3804200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3804300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3804400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0243, + "step": 3804500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0263, + "step": 3804600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3804700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0244, + "step": 3804800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3804900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0243, + "step": 3805000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3805100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.024, + "step": 3805200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0241, + "step": 3805300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0259, + "step": 3805400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3805500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 3805600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3805700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3805800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3805900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 3806000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0265, + "step": 3806100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3806200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0235, + "step": 3806300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0242, + "step": 3806400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3806500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0243, + "step": 3806600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3806700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0236, + "step": 3806800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0244, + "step": 3806900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3807000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0258, + "step": 3807100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 3807200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3807300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0244, + "step": 3807400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3807500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3807600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3807700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0259, + "step": 3807800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 3807900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0242, + "step": 3808000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3808100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3808200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.026, + "step": 3808300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3808400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3808500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0259, + "step": 3808600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3808700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3808800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3808900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3809000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 3809100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3809200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3809300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3809400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0241, + "step": 3809500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3809600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0244, + "step": 3809700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3809800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3809900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3810000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 3810100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3810200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0259, + "step": 3810300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3810400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0239, + "step": 3810500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0244, + "step": 3810600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0258, + "step": 3810700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0261, + "step": 3810800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3810900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3811000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0262, + "step": 3811100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3811200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3811300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3811400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0242, + "step": 3811500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3811600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0259, + "step": 3811700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3811800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0244, + "step": 3811900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0243, + "step": 3812000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3812100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3812200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.024, + "step": 3812300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0262, + "step": 3812400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0259, + "step": 3812500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 3812600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3812700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0237, + "step": 3812800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.026, + "step": 3812900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3813000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3813100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3813200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0243, + "step": 3813300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3813400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3813500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0243, + "step": 3813600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3813700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3813800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0244, + "step": 3813900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3814000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3814100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3814200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3814300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0242, + "step": 3814400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0242, + "step": 3814500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0241, + "step": 3814600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3814700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3814800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3814900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3815000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0264, + "step": 3815100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3815200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3815300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 3815400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3815500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3815600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3815700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3815800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.024, + "step": 3815900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3816000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3816100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3816200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.026, + "step": 3816300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0259, + "step": 3816400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0241, + "step": 3816500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0239, + "step": 3816600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0259, + "step": 3816700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3816800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3816900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0242, + "step": 3817000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3817100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0243, + "step": 3817200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0242, + "step": 3817300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3817400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3817500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3817600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3817700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3817800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0258, + "step": 3817900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3818000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3818100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3818200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0261, + "step": 3818300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3818400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3818500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3818600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0237, + "step": 3818700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3818800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 3818900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0263, + "step": 3819000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3819100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3819200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3819300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3819400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3819500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3819600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3819700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.024, + "step": 3819800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3819900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3820000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0247802734375, + "eval_runtime": 126.1573, + "eval_samples_per_second": 396.331, + "eval_steps_per_second": 24.771, + "step": 3820000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3820100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 3820200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3820300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3820400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0259, + "step": 3820500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3820600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.026, + "step": 3820700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3820800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3820900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3821000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3821100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3821200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3821300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3821400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3821500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0243, + "step": 3821600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0244, + "step": 3821700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0268, + "step": 3821800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3821900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0266, + "step": 3822000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.024, + "step": 3822100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0242, + "step": 3822200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3822300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3822400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0244, + "step": 3822500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3822600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0261, + "step": 3822700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3822800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0238, + "step": 3822900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3823000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3823100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3823200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3823300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3823400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3823500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3823600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3823700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0243, + "step": 3823800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3823900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 3824000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 3824100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0262, + "step": 3824200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3824300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3824400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3824500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3824600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0243, + "step": 3824700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 3824800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0234, + "step": 3824900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3825000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3825100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3825200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3825300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3825400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0244, + "step": 3825500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3825600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3825700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0243, + "step": 3825800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3825900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3826000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3826100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 3826200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0258, + "step": 3826300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3826400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0259, + "step": 3826500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0241, + "step": 3826600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3826700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3826800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3826900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3827000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3827100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3827200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 3827300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3827400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3827500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3827600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3827700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3827800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 3827900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3828000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3828100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.026, + "step": 3828200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0242, + "step": 3828300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0268, + "step": 3828400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3828500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0258, + "step": 3828600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3828700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3828800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0259, + "step": 3828900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 3829000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0258, + "step": 3829100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0258, + "step": 3829200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3829300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 3829400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 3829500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3829600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0263, + "step": 3829700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3829800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3829900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3830000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3830100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3830200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3830300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3830400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3830500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3830600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3830700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3830800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3830900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3831000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3831100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3831200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3831300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0237, + "step": 3831400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3831500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0242, + "step": 3831600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3831700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3831800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3831900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3832000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3832100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 3832200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3832300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0241, + "step": 3832400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3832500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3832600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0244, + "step": 3832700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0242, + "step": 3832800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 3832900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0261, + "step": 3833000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3833100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3833200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0264, + "step": 3833300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3833400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3833500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 3833600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3833700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3833800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0238, + "step": 3833900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3834000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3834100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3834200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3834300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 3834400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3834500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 3834600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3834700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0242, + "step": 3834800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3834900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3835000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3835100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3835200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3835300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 3835400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0244, + "step": 3835500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3835600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0238, + "step": 3835700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3835800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0242, + "step": 3835900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3836000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3836100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3836200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0244, + "step": 3836300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 3836400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3836500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0258, + "step": 3836600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3836700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3836800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0237, + "step": 3836900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3837000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0236, + "step": 3837100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3837200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3837300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0244, + "step": 3837400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3837500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 3837600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3837700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3837800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3837900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3838000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3838100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3838200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3838300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0259, + "step": 3838400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 3838500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3838600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3838700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3838800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3838900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 3839000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3839100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3839200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3839300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3839400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3839500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0258, + "step": 3839600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3839700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0236, + "step": 3839800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 3839900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0268, + "step": 3840000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0247955322265625, + "eval_runtime": 136.6451, + "eval_samples_per_second": 365.911, + "eval_steps_per_second": 22.869, + "step": 3840000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3840100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3840200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 3840300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3840400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3840500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3840600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3840700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3840800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0266, + "step": 3840900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3841000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0259, + "step": 3841100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3841200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0259, + "step": 3841300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.026, + "step": 3841400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3841500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3841600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0259, + "step": 3841700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3841800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0263, + "step": 3841900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3842000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0244, + "step": 3842100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3842200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0241, + "step": 3842300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0244, + "step": 3842400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0244, + "step": 3842500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3842600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3842700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3842800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0244, + "step": 3842900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0243, + "step": 3843000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3843100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3843200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0262, + "step": 3843300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3843400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0244, + "step": 3843500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0242, + "step": 3843600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3843700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3843800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3843900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3844000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3844100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3844200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0263, + "step": 3844300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0238, + "step": 3844400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3844500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0242, + "step": 3844600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3844700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0244, + "step": 3844800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0261, + "step": 3844900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3845000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0243, + "step": 3845100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3845200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3845300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0244, + "step": 3845400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3845500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 3845600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3845700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0261, + "step": 3845800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3845900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3846000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0242, + "step": 3846100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3846200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0265, + "step": 3846300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3846400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0268, + "step": 3846500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0243, + "step": 3846600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3846700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3846800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3846900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3847000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0238, + "step": 3847100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3847200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0261, + "step": 3847300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3847400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0242, + "step": 3847500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0259, + "step": 3847600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3847700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3847800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0243, + "step": 3847900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3848000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 3848100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3848200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0244, + "step": 3848300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3848400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0236, + "step": 3848500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 3848600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0241, + "step": 3848700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0239, + "step": 3848800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0242, + "step": 3848900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3849000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 3849100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3849200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3849300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3849400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3849500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3849600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3849700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0244, + "step": 3849800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3849900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.026, + "step": 3850000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3850100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3850200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.026, + "step": 3850300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3850400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3850500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0263, + "step": 3850600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3850700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3850800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3850900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3851000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3851100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3851200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 3851300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0262, + "step": 3851400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0261, + "step": 3851500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 3851600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3851700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0243, + "step": 3851800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3851900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3852000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3852100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3852200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3852300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 3852400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.024, + "step": 3852500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0259, + "step": 3852600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 3852700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.024, + "step": 3852800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.024, + "step": 3852900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0241, + "step": 3853000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3853100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.024, + "step": 3853200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3853300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3853400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0264, + "step": 3853500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3853600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 3853700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0238, + "step": 3853800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0238, + "step": 3853900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3854000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3854100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3854200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3854300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0238, + "step": 3854400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0259, + "step": 3854500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3854600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3854700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3854800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3854900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3855000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3855100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3855200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3855300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0244, + "step": 3855400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0243, + "step": 3855500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0259, + "step": 3855600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3855700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0258, + "step": 3855800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0243, + "step": 3855900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3856000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0258, + "step": 3856100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0258, + "step": 3856200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3856300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0243, + "step": 3856400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3856500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3856600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0263, + "step": 3856700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3856800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3856900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3857000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3857100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0243, + "step": 3857200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0244, + "step": 3857300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3857400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3857500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3857600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3857700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3857800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 3857900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0263, + "step": 3858000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3858100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3858200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3858300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0244, + "step": 3858400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0243, + "step": 3858500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3858600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3858700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3858800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 3858900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3859000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3859100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3859200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3859300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3859400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3859500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0259, + "step": 3859600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3859700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3859800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3859900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3860000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0247802734375, + "eval_runtime": 141.9744, + "eval_samples_per_second": 352.176, + "eval_steps_per_second": 22.011, + "step": 3860000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3860100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.024, + "step": 3860200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3860300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3860400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3860500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0241, + "step": 3860600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3860700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3860800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3860900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3861000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0259, + "step": 3861100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3861200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3861300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0258, + "step": 3861400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3861500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3861600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3861700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3861800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3861900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3862000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3862100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3862200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3862300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3862400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3862500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3862600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0243, + "step": 3862700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0241, + "step": 3862800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3862900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3863000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0237, + "step": 3863100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0244, + "step": 3863200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3863300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 3863400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3863500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3863600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3863700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3863800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3863900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0262, + "step": 3864000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0261, + "step": 3864100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3864200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 3864300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0244, + "step": 3864400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3864500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3864600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3864700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3864800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0259, + "step": 3864900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3865000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0262, + "step": 3865100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3865200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3865300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3865400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.027, + "step": 3865500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3865600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3865700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0244, + "step": 3865800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3865900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3866000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3866100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0263, + "step": 3866200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0258, + "step": 3866300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.024, + "step": 3866400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.024, + "step": 3866500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3866600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 3866700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 3866800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0238, + "step": 3866900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3867000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 3867100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 3867200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3867300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3867400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3867500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3867600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3867700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3867800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3867900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3868000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3868100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0258, + "step": 3868200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3868300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0258, + "step": 3868400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3868500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3868600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.026, + "step": 3868700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3868800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3868900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3869000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3869100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3869200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3869300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0241, + "step": 3869400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3869500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3869600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3869700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3869800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3869900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3870000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0244, + "step": 3870100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3870200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0241, + "step": 3870300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.024, + "step": 3870400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3870500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3870600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3870700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3870800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3870900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3871000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3871100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0266, + "step": 3871200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3871300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3871400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3871500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.024, + "step": 3871600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 3871700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0241, + "step": 3871800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0262, + "step": 3871900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0243, + "step": 3872000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 3872100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3872200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3872300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3872400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3872500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0244, + "step": 3872600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3872700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0243, + "step": 3872800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3872900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3873000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3873100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3873200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3873300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0244, + "step": 3873400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0244, + "step": 3873500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3873600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3873700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3873800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0243, + "step": 3873900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3874000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0263, + "step": 3874100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0258, + "step": 3874200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3874300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 3874400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0261, + "step": 3874500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0241, + "step": 3874600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 3874700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3874800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3874900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3875000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3875100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0243, + "step": 3875200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3875300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3875400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3875500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3875600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3875700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3875800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3875900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3876000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3876100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3876200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 3876300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3876400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 3876500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3876600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3876700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3876800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3876900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3877000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3877100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 3877200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3877300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0259, + "step": 3877400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0239, + "step": 3877500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0244, + "step": 3877600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3877700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0241, + "step": 3877800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.024, + "step": 3877900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 3878000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3878100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3878200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0243, + "step": 3878300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3878400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 3878500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 3878600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3878700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0238, + "step": 3878800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 3878900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 3879000 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3879100 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 3879200 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0236, + "step": 3879300 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 3879400 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0258, + "step": 3879500 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3879600 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 3879700 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 3879800 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0234, + "step": 3879900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-06, + "loss": 0.0241, + "step": 3880000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0247955322265625, + "eval_runtime": 141.1964, + "eval_samples_per_second": 354.117, + "eval_steps_per_second": 22.132, + "step": 3880000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.026, + "step": 3880100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 3880200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.024, + "step": 3880300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0243, + "step": 3880400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0258, + "step": 3880500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 3880600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0242, + "step": 3880700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 3880800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0258, + "step": 3880900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 3881000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 3881100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0257, + "step": 3881200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 3881300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0258, + "step": 3881400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.024, + "step": 3881500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 3881600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 3881700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 3881800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 3881900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 3882000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 3882100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0244, + "step": 3882200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 3882300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0263, + "step": 3882400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 3882500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 3882600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 3882700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 3882800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 3882900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0264, + "step": 3883000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 3883100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 3883200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 3883300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 3883400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 3883500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 3883600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 3883700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0262, + "step": 3883800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 3883900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 3884000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 3884100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 3884200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 3884300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0259, + "step": 3884400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0238, + "step": 3884500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 3884600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 3884700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 3884800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 3884900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0259, + "step": 3885000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0232, + "step": 3885100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 3885200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 3885300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 3885400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0243, + "step": 3885500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 3885600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0264, + "step": 3885700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 3885800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0244, + "step": 3885900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 3886000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 3886100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 3886200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 3886300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 3886400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 3886500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 3886600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 3886700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 3886800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0243, + "step": 3886900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 3887000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 3887100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 3887200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0241, + "step": 3887300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0259, + "step": 3887400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 3887500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 3887600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0258, + "step": 3887700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0257, + "step": 3887800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0243, + "step": 3887900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 3888000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 3888100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0244, + "step": 3888200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0243, + "step": 3888300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 3888400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.026, + "step": 3888500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.027, + "step": 3888600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0259, + "step": 3888700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 3888800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0234, + "step": 3888900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0267, + "step": 3889000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 3889100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0257, + "step": 3889200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 3889300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 3889400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 3889500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0257, + "step": 3889600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 3889700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 3889800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 3889900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 3890000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 3890100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 3890200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 3890300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0272, + "step": 3890400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 3890500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 3890600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 3890700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 3890800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 3890900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 3891000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 3891100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0238, + "step": 3891200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 3891300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 3891400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 3891500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 3891600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0262, + "step": 3891700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0242, + "step": 3891800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 3891900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 3892000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 3892100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 3892200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 3892300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 3892400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 3892500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 3892600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 3892700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.026, + "step": 3892800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 3892900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 3893000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0242, + "step": 3893100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 3893200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0265, + "step": 3893300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 3893400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 3893500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 3893600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.026, + "step": 3893700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 3893800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 3893900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 3894000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0264, + "step": 3894100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 3894200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 3894300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 3894400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0243, + "step": 3894500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 3894600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.026, + "step": 3894700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 3894800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 3894900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 3895000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 3895100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 3895200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 3895300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 3895400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 3895500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 3895600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 3895700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 3895800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 3895900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0259, + "step": 3896000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 3896100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0265, + "step": 3896200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0242, + "step": 3896300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0242, + "step": 3896400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 3896500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 3896600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0244, + "step": 3896700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0258, + "step": 3896800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 3896900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 3897000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0244, + "step": 3897100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 3897200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 3897300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 3897400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 3897500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 3897600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 3897700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0242, + "step": 3897800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0258, + "step": 3897900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 3898000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 3898100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0237, + "step": 3898200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0263, + "step": 3898300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0239, + "step": 3898400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 3898500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0257, + "step": 3898600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 3898700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0258, + "step": 3898800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 3898900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0257, + "step": 3899000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 3899100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 3899200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 3899300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 3899400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 3899500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 3899600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0257, + "step": 3899700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.026, + "step": 3899800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0242, + "step": 3899900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.024, + "step": 3900000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0247802734375, + "eval_runtime": 138.0188, + "eval_samples_per_second": 362.27, + "eval_steps_per_second": 22.642, + "step": 3900000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 3900100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 3900200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0238, + "step": 3900300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 3900400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 3900500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 3900600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 3900700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 3900800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 3900900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 3901000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 3901100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 3901200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 3901300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 3901400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 3901500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0257, + "step": 3901600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 3901700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 3901800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 3901900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 3902000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 3902100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 3902200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 3902300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 3902400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 3902500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0259, + "step": 3902600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0259, + "step": 3902700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 3902800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 3902900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0257, + "step": 3903000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0241, + "step": 3903100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 3903200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 3903300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 3903400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 3903500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 3903600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0257, + "step": 3903700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 3903800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 3903900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 3904000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 3904100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0258, + "step": 3904200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 3904300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 3904400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0261, + "step": 3904500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 3904600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 3904700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0259, + "step": 3904800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 3904900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0242, + "step": 3905000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 3905100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0244, + "step": 3905200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 3905300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 3905400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0259, + "step": 3905500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 3905600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0244, + "step": 3905700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0241, + "step": 3905800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 3905900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 3906000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 3906100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 3906200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 3906300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 3906400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 3906500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 3906600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 3906700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 3906800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 3906900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0257, + "step": 3907000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 3907100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 3907200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 3907300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0267, + "step": 3907400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0239, + "step": 3907500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 3907600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0241, + "step": 3907700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 3907800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 3907900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 3908000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 3908100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0243, + "step": 3908200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0261, + "step": 3908300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 3908400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 3908500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 3908600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 3908700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 3908800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 3908900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 3909000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 3909100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.024, + "step": 3909200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 3909300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 3909400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 3909500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 3909600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 3909700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 3909800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 3909900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 3910000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0258, + "step": 3910100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 3910200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 3910300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 3910400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0243, + "step": 3910500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.024, + "step": 3910600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 3910700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.026, + "step": 3910800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 3910900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 3911000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 3911100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 3911200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0239, + "step": 3911300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 3911400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 3911500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 3911600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0258, + "step": 3911700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0257, + "step": 3911800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 3911900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 3912000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0257, + "step": 3912100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0242, + "step": 3912200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.024, + "step": 3912300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 3912400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 3912500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 3912600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0242, + "step": 3912700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0241, + "step": 3912800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 3912900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 3913000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 3913100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 3913200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0238, + "step": 3913300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 3913400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 3913500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0243, + "step": 3913600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 3913700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0259, + "step": 3913800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 3913900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 3914000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 3914100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0267, + "step": 3914200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 3914300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0242, + "step": 3914400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 3914500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 3914600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 3914700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 3914800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 3914900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 3915000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 3915100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 3915200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 3915300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 3915400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 3915500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 3915600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 3915700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 3915800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 3915900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 3916000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 3916100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.024, + "step": 3916200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0241, + "step": 3916300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 3916400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 3916500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0244, + "step": 3916600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 3916700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0258, + "step": 3916800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 3916900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.026, + "step": 3917000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 3917100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 3917200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0242, + "step": 3917300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 3917400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 3917500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 3917600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 3917700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 3917800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 3917900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0259, + "step": 3918000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0242, + "step": 3918100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 3918200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0261, + "step": 3918300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 3918400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0257, + "step": 3918500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 3918600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0242, + "step": 3918700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 3918800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 3918900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0244, + "step": 3919000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0243, + "step": 3919100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 3919200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 3919300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 3919400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 3919500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0257, + "step": 3919600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 3919700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 3919800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 3919900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0244, + "step": 3920000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0247802734375, + "eval_runtime": 139.7293, + "eval_samples_per_second": 357.835, + "eval_steps_per_second": 22.365, + "step": 3920000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 3920100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0257, + "step": 3920200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 3920300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0262, + "step": 3920400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 3920500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 3920600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 3920700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 3920800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 3920900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 3921000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 3921100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0258, + "step": 3921200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 3921300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 3921400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 3921500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 3921600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 3921700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 3921800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 3921900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 3922000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 3922100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 3922200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0259, + "step": 3922300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 3922400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 3922500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 3922600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0263, + "step": 3922700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 3922800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 3922900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 3923000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0241, + "step": 3923100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 3923200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0242, + "step": 3923300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 3923400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 3923500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0243, + "step": 3923600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.026, + "step": 3923700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 3923800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0262, + "step": 3923900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0244, + "step": 3924000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 3924100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 3924200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 3924300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 3924400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 3924500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 3924600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 3924700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 3924800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 3924900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0244, + "step": 3925000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 3925100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0265, + "step": 3925200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 3925300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 3925400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 3925500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 3925600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 3925700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 3925800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 3925900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 3926000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 3926100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 3926200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 3926300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 3926400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 3926500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 3926600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 3926700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0243, + "step": 3926800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 3926900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0257, + "step": 3927000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0244, + "step": 3927100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0238, + "step": 3927200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 3927300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0239, + "step": 3927400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 3927500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 3927600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.024, + "step": 3927700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 3927800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 3927900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 3928000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0242, + "step": 3928100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 3928200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 3928300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 3928400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 3928500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 3928600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 3928700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 3928800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0241, + "step": 3928900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0262, + "step": 3929000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0261, + "step": 3929100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 3929200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0257, + "step": 3929300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 3929400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 3929500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 3929600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0241, + "step": 3929700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 3929800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 3929900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 3930000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 3930100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 3930200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0263, + "step": 3930300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 3930400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0244, + "step": 3930500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 3930600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0235, + "step": 3930700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 3930800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 3930900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 3931000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 3931100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 3931200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0257, + "step": 3931300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 3931400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 3931500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0243, + "step": 3931600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0244, + "step": 3931700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0241, + "step": 3931800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0241, + "step": 3931900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 3932000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 3932100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 3932200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 3932300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 3932400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 3932500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 3932600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 3932700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0243, + "step": 3932800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 3932900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 3933000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 3933100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0244, + "step": 3933200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 3933300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 3933400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 3933500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 3933600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0244, + "step": 3933700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 3933800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 3933900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 3934000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 3934100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0258, + "step": 3934200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 3934300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.026, + "step": 3934400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 3934500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0258, + "step": 3934600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 3934700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.026, + "step": 3934800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 3934900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0239, + "step": 3935000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 3935100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 3935200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0242, + "step": 3935300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 3935400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 3935500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 3935600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 3935700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 3935800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 3935900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0244, + "step": 3936000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0243, + "step": 3936100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 3936200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 3936300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0259, + "step": 3936400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 3936500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 3936600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0257, + "step": 3936700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0244, + "step": 3936800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0257, + "step": 3936900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 3937000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 3937100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 3937200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 3937300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 3937400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 3937500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 3937600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 3937700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 3937800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 3937900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 3938000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 3938100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 3938200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0244, + "step": 3938300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 3938400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 3938500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0241, + "step": 3938600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 3938700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 3938800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0243, + "step": 3938900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0241, + "step": 3939000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 3939100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 3939200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 3939300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0243, + "step": 3939400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 3939500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 3939600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 3939700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 3939800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 3939900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 3940000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0247650146484375, + "eval_runtime": 138.4031, + "eval_samples_per_second": 361.264, + "eval_steps_per_second": 22.579, + "step": 3940000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 3940100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 3940200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 3940300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 3940400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0244, + "step": 3940500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.024, + "step": 3940600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 3940700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 3940800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 3940900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 3941000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 3941100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0239, + "step": 3941200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0243, + "step": 3941300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0257, + "step": 3941400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0243, + "step": 3941500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 3941600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 3941700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 3941800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 3941900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 3942000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0257, + "step": 3942100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0236, + "step": 3942200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0237, + "step": 3942300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 3942400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0244, + "step": 3942500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 3942600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 3942700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 3942800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 3942900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.026, + "step": 3943000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 3943100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 3943200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 3943300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0243, + "step": 3943400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 3943500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 3943600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 3943700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0259, + "step": 3943800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 3943900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0262, + "step": 3944000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 3944100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 3944200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 3944300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0265, + "step": 3944400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0244, + "step": 3944500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 3944600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 3944700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 3944800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 3944900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 3945000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0261, + "step": 3945100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0257, + "step": 3945200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 3945300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0266, + "step": 3945400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 3945500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0237, + "step": 3945600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 3945700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 3945800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 3945900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 3946000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 3946100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0241, + "step": 3946200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 3946300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0261, + "step": 3946400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 3946500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0244, + "step": 3946600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 3946700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 3946800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0258, + "step": 3946900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0263, + "step": 3947000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 3947100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 3947200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 3947300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 3947400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0262, + "step": 3947500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 3947600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 3947700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0242, + "step": 3947800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0257, + "step": 3947900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 3948000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 3948100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0257, + "step": 3948200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 3948300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0237, + "step": 3948400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 3948500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0237, + "step": 3948600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 3948700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 3948800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.026, + "step": 3948900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 3949000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 3949100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0262, + "step": 3949200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0259, + "step": 3949300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 3949400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 3949500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 3949600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0257, + "step": 3949700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 3949800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0262, + "step": 3949900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 3950000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0243, + "step": 3950100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.026, + "step": 3950200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 3950300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 3950400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0239, + "step": 3950500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0244, + "step": 3950600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0235, + "step": 3950700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 3950800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 3950900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 3951000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 3951100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 3951200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 3951300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0241, + "step": 3951400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0244, + "step": 3951500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0258, + "step": 3951600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 3951700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 3951800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 3951900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 3952000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 3952100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0261, + "step": 3952200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 3952300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 3952400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 3952500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0258, + "step": 3952600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 3952700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0258, + "step": 3952800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 3952900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 3953000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 3953100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 3953200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 3953300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 3953400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 3953500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0244, + "step": 3953600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0244, + "step": 3953700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 3953800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0265, + "step": 3953900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 3954000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 3954100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0257, + "step": 3954200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0257, + "step": 3954300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 3954400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.024, + "step": 3954500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 3954600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 3954700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0241, + "step": 3954800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.024, + "step": 3954900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 3955000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 3955100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.024, + "step": 3955200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.024, + "step": 3955300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 3955400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0242, + "step": 3955500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 3955600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0242, + "step": 3955700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0244, + "step": 3955800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 3955900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 3956000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 3956100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 3956200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0261, + "step": 3956300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 3956400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0242, + "step": 3956500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 3956600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0258, + "step": 3956700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 3956800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 3956900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 3957000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 3957100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 3957200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 3957300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 3957400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0262, + "step": 3957500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0258, + "step": 3957600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 3957700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0257, + "step": 3957800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 3957900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0241, + "step": 3958000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0243, + "step": 3958100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 3958200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 3958300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0244, + "step": 3958400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 3958500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 3958600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0237, + "step": 3958700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 3958800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 3958900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 3959000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 3959100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0241, + "step": 3959200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0266, + "step": 3959300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 3959400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 3959500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 3959600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 3959700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0258, + "step": 3959800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 3959900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 3960000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0247650146484375, + "eval_runtime": 143.2715, + "eval_samples_per_second": 348.988, + "eval_steps_per_second": 21.812, + "step": 3960000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 3960100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 3960200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 3960300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 3960400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 3960500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 3960600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 3960700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0243, + "step": 3960800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 3960900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 3961000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 3961100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 3961200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 3961300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 3961400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.024, + "step": 3961500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0243, + "step": 3961600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 3961700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.024, + "step": 3961800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 3961900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 3962000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 3962100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 3962200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 3962300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 3962400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 3962500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.024, + "step": 3962600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0237, + "step": 3962700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 3962800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.024, + "step": 3962900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 3963000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.024, + "step": 3963100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0243, + "step": 3963200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 3963300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 3963400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 3963500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 3963600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 3963700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 3963800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0264, + "step": 3963900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 3964000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0239, + "step": 3964100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 3964200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 3964300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 3964400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0243, + "step": 3964500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 3964600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 3964700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 3964800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0244, + "step": 3964900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 3965000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0238, + "step": 3965100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0241, + "step": 3965200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 3965300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0259, + "step": 3965400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 3965500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 3965600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 3965700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 3965800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 3965900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 3966000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0242, + "step": 3966100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0243, + "step": 3966200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 3966300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0263, + "step": 3966400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 3966500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 3966600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 3966700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 3966800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 3966900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 3967000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 3967100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 3967200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 3967300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 3967400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 3967500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 3967600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0242, + "step": 3967700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 3967800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0257, + "step": 3967900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 3968000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 3968100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 3968200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 3968300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 3968400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0258, + "step": 3968500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 3968600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.027, + "step": 3968700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 3968800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 3968900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0244, + "step": 3969000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 3969100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0244, + "step": 3969200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 3969300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 3969400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 3969500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0264, + "step": 3969600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0241, + "step": 3969700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0259, + "step": 3969800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 3969900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 3970000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 3970100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 3970200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 3970300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0239, + "step": 3970400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 3970500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0244, + "step": 3970600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 3970700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 3970800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 3970900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 3971000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 3971100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 3971200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 3971300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 3971400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 3971500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0243, + "step": 3971600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 3971700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 3971800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0259, + "step": 3971900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0244, + "step": 3972000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 3972100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 3972200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 3972300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 3972400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 3972500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 3972600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 3972700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0243, + "step": 3972800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 3972900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0258, + "step": 3973000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0243, + "step": 3973100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 3973200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.024, + "step": 3973300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 3973400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 3973500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 3973600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 3973700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 3973800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 3973900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 3974000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 3974100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 3974200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 3974300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 3974400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 3974500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0239, + "step": 3974600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 3974700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 3974800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 3974900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 3975000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 3975100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0258, + "step": 3975200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 3975300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 3975400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0241, + "step": 3975500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 3975600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 3975700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0269, + "step": 3975800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 3975900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 3976000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 3976100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 3976200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 3976300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 3976400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 3976500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 3976600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0243, + "step": 3976700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 3976800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 3976900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 3977000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 3977100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 3977200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 3977300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 3977400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 3977500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0244, + "step": 3977600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 3977700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 3977800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0257, + "step": 3977900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 3978000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 3978100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 3978200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 3978300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 3978400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 3978500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0243, + "step": 3978600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0243, + "step": 3978700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 3978800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0244, + "step": 3978900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 3979000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 3979100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 3979200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 3979300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0243, + "step": 3979400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 3979500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 3979600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 3979700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 3979800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 3979900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0257, + "step": 3980000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0247650146484375, + "eval_runtime": 141.3953, + "eval_samples_per_second": 353.618, + "eval_steps_per_second": 22.101, + "step": 3980000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 3980100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 3980200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0261, + "step": 3980300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0242, + "step": 3980400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 3980500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 3980600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 3980700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 3980800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.026, + "step": 3980900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 3981000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 3981100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.024, + "step": 3981200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 3981300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 3981400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 3981500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 3981600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 3981700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 3981800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 3981900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 3982000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.024, + "step": 3982100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 3982200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 3982300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 3982400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 3982500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0244, + "step": 3982600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0258, + "step": 3982700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.026, + "step": 3982800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 3982900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 3983000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 3983100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0242, + "step": 3983200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0261, + "step": 3983300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 3983400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 3983500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 3983600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0237, + "step": 3983700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 3983800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 3983900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 3984000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0261, + "step": 3984100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0263, + "step": 3984200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0258, + "step": 3984300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 3984400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 3984500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 3984600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 3984700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 3984800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0244, + "step": 3984900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 3985000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 3985100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0239, + "step": 3985200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0243, + "step": 3985300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0259, + "step": 3985400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 3985500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0257, + "step": 3985600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0259, + "step": 3985700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 3985800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0263, + "step": 3985900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0238, + "step": 3986000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 3986100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0243, + "step": 3986200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0238, + "step": 3986300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 3986400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0263, + "step": 3986500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0257, + "step": 3986600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.024, + "step": 3986700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0258, + "step": 3986800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 3986900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0241, + "step": 3987000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 3987100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 3987200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 3987300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0238, + "step": 3987400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0244, + "step": 3987500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 3987600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 3987700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 3987800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0241, + "step": 3987900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 3988000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 3988100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 3988200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 3988300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 3988400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0244, + "step": 3988500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0241, + "step": 3988600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0244, + "step": 3988700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0264, + "step": 3988800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 3988900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 3989000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 3989100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0244, + "step": 3989200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 3989300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 3989400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 3989500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 3989600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.026, + "step": 3989700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 3989800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0241, + "step": 3989900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.024, + "step": 3990000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0257, + "step": 3990100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 3990200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.024, + "step": 3990300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 3990400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 3990500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0244, + "step": 3990600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0241, + "step": 3990700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 3990800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 3990900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0241, + "step": 3991000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 3991100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 3991200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 3991300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0272, + "step": 3991400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 3991500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 3991600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 3991700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 3991800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 3991900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 3992000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 3992100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 3992200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 3992300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 3992400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 3992500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 3992600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 3992700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0257, + "step": 3992800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 3992900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 3993000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0241, + "step": 3993100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0257, + "step": 3993200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 3993300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 3993400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0244, + "step": 3993500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 3993600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 3993700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 3993800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 3993900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 3994000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 3994100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0259, + "step": 3994200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 3994300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 3994400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 3994500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 3994600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 3994700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 3994800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 3994900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 3995000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0244, + "step": 3995100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0257, + "step": 3995200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0244, + "step": 3995300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 3995400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 3995500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 3995600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 3995700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 3995800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 3995900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0263, + "step": 3996000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 3996100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 3996200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 3996300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0257, + "step": 3996400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 3996500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.026, + "step": 3996600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0243, + "step": 3996700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0257, + "step": 3996800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0259, + "step": 3996900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0264, + "step": 3997000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 3997100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 3997200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 3997300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 3997400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0235, + "step": 3997500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 3997600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 3997700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 3997800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 3997900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 3998000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 3998100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 3998200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0264, + "step": 3998300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 3998400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 3998500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 3998600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 3998700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 3998800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0238, + "step": 3998900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 3999000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 3999100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 3999200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 3999300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 3999400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 3999500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 3999600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 3999700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0243, + "step": 3999800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 3999900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 4000000 + }, + { + "epoch": 0.0, + "eval_loss": 0.024749755859375, + "eval_runtime": 134.1841, + "eval_samples_per_second": 372.622, + "eval_steps_per_second": 23.289, + "step": 4000000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 4000100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 4000200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 4000300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 4000400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 4000500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0243, + "step": 4000600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 4000700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 4000800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 4000900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0244, + "step": 4001000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 4001100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 4001200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 4001300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 4001400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 4001500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 4001600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0265, + "step": 4001700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 4001800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 4001900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 4002000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 4002100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0243, + "step": 4002200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 4002300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 4002400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 4002500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0244, + "step": 4002600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 4002700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0257, + "step": 4002800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0258, + "step": 4002900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 4003000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 4003100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0243, + "step": 4003200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 4003300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 4003400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 4003500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0243, + "step": 4003600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 4003700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 4003800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0238, + "step": 4003900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 4004000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 4004100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 4004200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 4004300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 4004400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.024, + "step": 4004500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 4004600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 4004700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 4004800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0239, + "step": 4004900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 4005000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 4005100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 4005200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 4005300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 4005400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 4005500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 4005600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0257, + "step": 4005700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0242, + "step": 4005800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 4005900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 4006000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 4006100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0241, + "step": 4006200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0242, + "step": 4006300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 4006400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 4006500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 4006600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 4006700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0243, + "step": 4006800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0263, + "step": 4006900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.026, + "step": 4007000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 4007100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0238, + "step": 4007200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 4007300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0241, + "step": 4007400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 4007500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.024, + "step": 4007600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 4007700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0235, + "step": 4007800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0234, + "step": 4007900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0257, + "step": 4008000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0243, + "step": 4008100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 4008200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 4008300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0244, + "step": 4008400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 4008500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 4008600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0242, + "step": 4008700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 4008800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 4008900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 4009000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 4009100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 4009200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 4009300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0241, + "step": 4009400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 4009500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 4009600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.024, + "step": 4009700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0241, + "step": 4009800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 4009900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 4010000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0259, + "step": 4010100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0242, + "step": 4010200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 4010300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 4010400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 4010500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 4010600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 4010700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 4010800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 4010900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0243, + "step": 4011000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 4011100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0258, + "step": 4011200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 4011300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0259, + "step": 4011400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 4011500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 4011600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 4011700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.026, + "step": 4011800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 4011900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 4012000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0242, + "step": 4012100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 4012200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0244, + "step": 4012300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0244, + "step": 4012400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0257, + "step": 4012500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0244, + "step": 4012600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 4012700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 4012800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0243, + "step": 4012900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 4013000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 4013100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 4013200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0241, + "step": 4013300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 4013400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 4013500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 4013600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0244, + "step": 4013700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 4013800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0244, + "step": 4013900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.026, + "step": 4014000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.026, + "step": 4014100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.024, + "step": 4014200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 4014300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 4014400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 4014500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0239, + "step": 4014600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 4014700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 4014800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 4014900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0242, + "step": 4015000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 4015100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 4015200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 4015300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 4015400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 4015500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 4015600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 4015700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 4015800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0244, + "step": 4015900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0242, + "step": 4016000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 4016100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0244, + "step": 4016200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 4016300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 4016400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 4016500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 4016600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0241, + "step": 4016700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0258, + "step": 4016800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 4016900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 4017000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.024, + "step": 4017100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 4017200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0244, + "step": 4017300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 4017400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0244, + "step": 4017500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0259, + "step": 4017600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0257, + "step": 4017700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 4017800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 4017900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 4018000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 4018100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 4018200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 4018300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 4018400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.026, + "step": 4018500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0242, + "step": 4018600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0243, + "step": 4018700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 4018800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 4018900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 4019000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 4019100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 4019200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0259, + "step": 4019300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 4019400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 4019500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0242, + "step": 4019600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 4019700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 4019800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0243, + "step": 4019900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 4020000 + }, + { + "epoch": 0.0, + "eval_loss": 0.024749755859375, + "eval_runtime": 136.2543, + "eval_samples_per_second": 366.961, + "eval_steps_per_second": 22.935, + "step": 4020000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 4020100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 4020200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 4020300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0273, + "step": 4020400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 4020500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 4020600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 4020700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 4020800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0257, + "step": 4020900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0243, + "step": 4021000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 4021100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 4021200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0234, + "step": 4021300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0244, + "step": 4021400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 4021500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 4021600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 4021700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 4021800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 4021900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 4022000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 4022100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0244, + "step": 4022200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0272, + "step": 4022300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 4022400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0263, + "step": 4022500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 4022600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 4022700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0257, + "step": 4022800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 4022900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.024, + "step": 4023000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 4023100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 4023200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 4023300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 4023400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 4023500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.024, + "step": 4023600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 4023700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 4023800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 4023900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 4024000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 4024100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 4024200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 4024300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0259, + "step": 4024400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 4024500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0258, + "step": 4024600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 4024700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0244, + "step": 4024800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 4024900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 4025000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.024, + "step": 4025100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 4025200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0235, + "step": 4025300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 4025400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 4025500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0242, + "step": 4025600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 4025700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 4025800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0267, + "step": 4025900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 4026000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0244, + "step": 4026100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 4026200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 4026300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 4026400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 4026500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0261, + "step": 4026600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 4026700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 4026800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 4026900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 4027000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 4027100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 4027200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0257, + "step": 4027300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 4027400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0259, + "step": 4027500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 4027600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 4027700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 4027800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 4027900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 4028000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 4028100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0263, + "step": 4028200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0257, + "step": 4028300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0244, + "step": 4028400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 4028500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0258, + "step": 4028600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 4028700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 4028800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 4028900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0258, + "step": 4029000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 4029100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 4029200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 4029300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0258, + "step": 4029400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0259, + "step": 4029500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 4029600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0242, + "step": 4029700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 4029800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0238, + "step": 4029900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 4030000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0243, + "step": 4030100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 4030200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 4030300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 4030400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 4030500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 4030600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0244, + "step": 4030700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 4030800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 4030900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 4031000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 4031100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 4031200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0257, + "step": 4031300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0271, + "step": 4031400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0258, + "step": 4031500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 4031600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 4031700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 4031800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 4031900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 4032000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 4032100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 4032200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0257, + "step": 4032300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 4032400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0258, + "step": 4032500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0262, + "step": 4032600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0261, + "step": 4032700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 4032800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0238, + "step": 4032900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 4033000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0267, + "step": 4033100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 4033200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 4033300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 4033400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.026, + "step": 4033500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0257, + "step": 4033600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 4033700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 4033800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 4033900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0262, + "step": 4034000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 4034100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 4034200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 4034300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 4034400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0258, + "step": 4034500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 4034600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 4034700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 4034800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0242, + "step": 4034900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0242, + "step": 4035000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0239, + "step": 4035100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0243, + "step": 4035200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0259, + "step": 4035300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 4035400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0244, + "step": 4035500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 4035600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 4035700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 4035800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 4035900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 4036000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 4036100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0242, + "step": 4036200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 4036300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0257, + "step": 4036400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0241, + "step": 4036500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 4036600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0242, + "step": 4036700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 4036800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 4036900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 4037000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 4037100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 4037200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0241, + "step": 4037300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 4037400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 4037500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0257, + "step": 4037600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 4037700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 4037800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 4037900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 4038000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 4038100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 4038200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 4038300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 4038400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 4038500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0239, + "step": 4038600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.026, + "step": 4038700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0257, + "step": 4038800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0242, + "step": 4038900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 4039000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 4039100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 4039200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0269, + "step": 4039300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 4039400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0261, + "step": 4039500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 4039600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.026, + "step": 4039700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 4039800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 4039900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 4040000 + }, + { + "epoch": 0.0, + "eval_loss": 0.024749755859375, + "eval_runtime": 136.601, + "eval_samples_per_second": 366.029, + "eval_steps_per_second": 22.877, + "step": 4040000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 4040100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 4040200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 4040300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 4040400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 4040500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 4040600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 4040700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0257, + "step": 4040800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 4040900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 4041000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 4041100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0244, + "step": 4041200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 4041300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 4041400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 4041500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 4041600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 4041700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 4041800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 4041900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 4042000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0263, + "step": 4042100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0232, + "step": 4042200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 4042300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 4042400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 4042500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 4042600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 4042700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 4042800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 4042900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 4043000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 4043100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 4043200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0243, + "step": 4043300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0261, + "step": 4043400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 4043500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 4043600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0257, + "step": 4043700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 4043800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0242, + "step": 4043900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 4044000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0258, + "step": 4044100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 4044200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 4044300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 4044400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0257, + "step": 4044500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 4044600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0257, + "step": 4044700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 4044800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0241, + "step": 4044900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 4045000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 4045100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0263, + "step": 4045200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 4045300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 4045400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 4045500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 4045600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 4045700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 4045800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 4045900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 4046000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 4046100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 4046200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 4046300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0242, + "step": 4046400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0259, + "step": 4046500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0242, + "step": 4046600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0257, + "step": 4046700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 4046800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 4046900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 4047000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 4047100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 4047200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 4047300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 4047400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 4047500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 4047600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 4047700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 4047800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 4047900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 4048000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0258, + "step": 4048100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.024, + "step": 4048200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 4048300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0242, + "step": 4048400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0263, + "step": 4048500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 4048600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 4048700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 4048800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 4048900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 4049000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 4049100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0243, + "step": 4049200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 4049300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 4049400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 4049500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0259, + "step": 4049600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 4049700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 4049800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 4049900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 4050000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 4050100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 4050200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 4050300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 4050400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 4050500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 4050600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 4050700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 4050800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 4050900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 4051000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 4051100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0237, + "step": 4051200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 4051300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 4051400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 4051500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 4051600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0261, + "step": 4051700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 4051800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 4051900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 4052000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 4052100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 4052200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 4052300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 4052400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 4052500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 4052600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 4052700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.024, + "step": 4052800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 4052900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0244, + "step": 4053000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 4053100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0258, + "step": 4053200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 4053300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.026, + "step": 4053400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 4053500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0259, + "step": 4053600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 4053700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 4053800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 4053900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 4054000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 4054100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0243, + "step": 4054200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.024, + "step": 4054300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 4054400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0243, + "step": 4054500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 4054600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 4054700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 4054800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 4054900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 4055000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0276, + "step": 4055100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 4055200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 4055300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 4055400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 4055500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 4055600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 4055700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 4055800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 4055900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 4056000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 4056100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0238, + "step": 4056200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 4056300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 4056400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 4056500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 4056600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0234, + "step": 4056700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.024, + "step": 4056800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 4056900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 4057000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 4057100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 4057200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 4057300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 4057400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0243, + "step": 4057500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0259, + "step": 4057600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 4057700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 4057800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0241, + "step": 4057900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 4058000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 4058100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0257, + "step": 4058200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 4058300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 4058400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 4058500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0236, + "step": 4058600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 4058700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0243, + "step": 4058800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 4058900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 4059000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 4059100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 4059200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 4059300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 4059400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0239, + "step": 4059500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 4059600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 4059700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 4059800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 4059900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 4060000 + }, + { + "epoch": 0.0, + "eval_loss": 0.024749755859375, + "eval_runtime": 140.5228, + "eval_samples_per_second": 355.814, + "eval_steps_per_second": 22.238, + "step": 4060000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0241, + "step": 4060100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 4060200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 4060300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 4060400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0242, + "step": 4060500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 4060600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 4060700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 4060800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 4060900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.026, + "step": 4061000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 4061100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 4061200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 4061300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0242, + "step": 4061400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 4061500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0238, + "step": 4061600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0244, + "step": 4061700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0243, + "step": 4061800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 4061900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 4062000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 4062100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 4062200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 4062300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 4062400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 4062500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 4062600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 4062700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 4062800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 4062900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 4063000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 4063100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0257, + "step": 4063200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 4063300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 4063400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0259, + "step": 4063500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0259, + "step": 4063600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0241, + "step": 4063700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 4063800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 4063900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 4064000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 4064100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0242, + "step": 4064200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 4064300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0243, + "step": 4064400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0244, + "step": 4064500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 4064600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 4064700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 4064800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 4064900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 4065000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 4065100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 4065200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 4065300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 4065400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0257, + "step": 4065500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0257, + "step": 4065600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 4065700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 4065800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 4065900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 4066000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 4066100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 4066200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 4066300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 4066400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0239, + "step": 4066500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 4066600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 4066700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 4066800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 4066900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0257, + "step": 4067000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 4067100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0243, + "step": 4067200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 4067300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 4067400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 4067500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0242, + "step": 4067600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 4067700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0239, + "step": 4067800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 4067900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 4068000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0242, + "step": 4068100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 4068200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 4068300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 4068400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 4068500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0262, + "step": 4068600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.024, + "step": 4068700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0244, + "step": 4068800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0242, + "step": 4068900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 4069000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0243, + "step": 4069100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0262, + "step": 4069200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 4069300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 4069400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0257, + "step": 4069500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 4069600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0244, + "step": 4069700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 4069800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 4069900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.026, + "step": 4070000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 4070100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 4070200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 4070300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0236, + "step": 4070400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 4070500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 4070600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 4070700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 4070800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.024, + "step": 4070900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 4071000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 4071100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.024, + "step": 4071200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 4071300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0243, + "step": 4071400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 4071500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 4071600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 4071700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 4071800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 4071900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0237, + "step": 4072000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0258, + "step": 4072100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0264, + "step": 4072200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 4072300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 4072400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 4072500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 4072600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 4072700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 4072800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 4072900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 4073000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 4073100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0243, + "step": 4073200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 4073300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 4073400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0243, + "step": 4073500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 4073600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 4073700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 4073800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0258, + "step": 4073900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0257, + "step": 4074000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 4074100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0244, + "step": 4074200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0257, + "step": 4074300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 4074400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 4074500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 4074600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 4074700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 4074800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 4074900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 4075000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 4075100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 4075200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0238, + "step": 4075300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 4075400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 4075500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 4075600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 4075700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0243, + "step": 4075800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 4075900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 4076000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 4076100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 4076200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 4076300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 4076400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 4076500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0258, + "step": 4076600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 4076700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 4076800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 4076900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 4077000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 4077100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 4077200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0244, + "step": 4077300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0242, + "step": 4077400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 4077500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0257, + "step": 4077600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0238, + "step": 4077700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 4077800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 4077900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 4078000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 4078100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0244, + "step": 4078200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 4078300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 4078400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 4078500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 4078600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 4078700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0266, + "step": 4078800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 4078900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0262, + "step": 4079000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 4079100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 4079200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 4079300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 4079400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 4079500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 4079600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 4079700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 4079800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 4079900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 4080000 + }, + { + "epoch": 0.0, + "eval_loss": 0.024749755859375, + "eval_runtime": 137.9953, + "eval_samples_per_second": 362.331, + "eval_steps_per_second": 22.646, + "step": 4080000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 4080100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 4080200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 4080300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.024, + "step": 4080400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0242, + "step": 4080500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0258, + "step": 4080600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 4080700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0244, + "step": 4080800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 4080900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 4081000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 4081100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 4081200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 4081300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 4081400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 4081500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 4081600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 4081700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 4081800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 4081900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0242, + "step": 4082000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0244, + "step": 4082100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0258, + "step": 4082200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 4082300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 4082400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 4082500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 4082600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0243, + "step": 4082700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 4082800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 4082900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0243, + "step": 4083000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 4083100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 4083200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 4083300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0244, + "step": 4083400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 4083500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 4083600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0238, + "step": 4083700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 4083800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 4083900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 4084000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 4084100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 4084200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 4084300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 4084400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.024, + "step": 4084500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 4084600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0243, + "step": 4084700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 4084800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0257, + "step": 4084900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0244, + "step": 4085000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 4085100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 4085200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0257, + "step": 4085300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 4085400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 4085500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 4085600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 4085700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.024, + "step": 4085800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 4085900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 4086000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0258, + "step": 4086100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 4086200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 4086300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 4086400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 4086500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0257, + "step": 4086600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 4086700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 4086800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 4086900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 4087000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 4087100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 4087200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0244, + "step": 4087300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0262, + "step": 4087400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 4087500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 4087600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 4087700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 4087800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 4087900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 4088000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.024, + "step": 4088100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 4088200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 4088300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 4088400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 4088500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0259, + "step": 4088600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 4088700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 4088800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 4088900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 4089000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 4089100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 4089200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0244, + "step": 4089300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0263, + "step": 4089400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0259, + "step": 4089500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 4089600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 4089700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 4089800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 4089900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 4090000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 4090100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.024, + "step": 4090200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 4090300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 4090400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 4090500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 4090600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 4090700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0244, + "step": 4090800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 4090900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0231, + "step": 4091000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 4091100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 4091200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 4091300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 4091400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0242, + "step": 4091500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 4091600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0238, + "step": 4091700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 4091800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 4091900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 4092000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0244, + "step": 4092100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 4092200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0239, + "step": 4092300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 4092400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0263, + "step": 4092500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0242, + "step": 4092600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 4092700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 4092800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 4092900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 4093000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 4093100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 4093200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 4093300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0244, + "step": 4093400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 4093500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 4093600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 4093700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 4093800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 4093900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0243, + "step": 4094000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 4094100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 4094200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 4094300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 4094400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 4094500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 4094600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0243, + "step": 4094700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 4094800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 4094900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 4095000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 4095100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 4095200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 4095300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0242, + "step": 4095400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 4095500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.024, + "step": 4095600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0235, + "step": 4095700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 4095800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 4095900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 4096000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.024, + "step": 4096100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 4096200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 4096300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 4096400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0244, + "step": 4096500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 4096600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 4096700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 4096800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 4096900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 4097000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 4097100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 4097200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0241, + "step": 4097300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 4097400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 4097500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 4097600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 4097700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 4097800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 4097900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0242, + "step": 4098000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 4098100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0261, + "step": 4098200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0257, + "step": 4098300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0244, + "step": 4098400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 4098500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0243, + "step": 4098600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0242, + "step": 4098700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 4098800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0241, + "step": 4098900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0241, + "step": 4099000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0259, + "step": 4099100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 4099200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 4099300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.024, + "step": 4099400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0258, + "step": 4099500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 4099600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0244, + "step": 4099700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 4099800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 4099900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0243, + "step": 4100000 + }, + { + "epoch": 0.0, + "eval_loss": 0.0247650146484375, + "eval_runtime": 143.0486, + "eval_samples_per_second": 349.532, + "eval_steps_per_second": 21.846, + "step": 4100000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 4100100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0257, + "step": 4100200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0244, + "step": 4100300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 4100400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 4100500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0258, + "step": 4100600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 4100700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 4100800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 4100900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 4101000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0242, + "step": 4101100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 4101200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 4101300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 4101400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0242, + "step": 4101500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 4101600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 4101700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0244, + "step": 4101800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 4101900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0242, + "step": 4102000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 4102100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0238, + "step": 4102200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 4102300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 4102400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 4102500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 4102600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0266, + "step": 4102700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0242, + "step": 4102800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 4102900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 4103000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0257, + "step": 4103100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 4103200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 4103300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0258, + "step": 4103400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0243, + "step": 4103500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 4103600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 4103700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0259, + "step": 4103800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 4103900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 4104000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 4104100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 4104200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0236, + "step": 4104300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 4104400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.024, + "step": 4104500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.026, + "step": 4104600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 4104700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 4104800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 4104900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 4105000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0241, + "step": 4105100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 4105200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 4105300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 4105400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 4105500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 4105600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 4105700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 4105800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0242, + "step": 4105900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0261, + "step": 4106000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0241, + "step": 4106100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0242, + "step": 4106200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 4106300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 4106400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 4106500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 4106600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.026, + "step": 4106700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 4106800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0262, + "step": 4106900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 4107000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 4107100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 4107200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 4107300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 4107400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 4107500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 4107600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0267, + "step": 4107700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 4107800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 4107900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.026, + "step": 4108000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 4108100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 4108200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0239, + "step": 4108300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 4108400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 4108500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0244, + "step": 4108600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 4108700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 4108800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 4108900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 4109000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 4109100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0257, + "step": 4109200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 4109300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 4109400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 4109500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.026, + "step": 4109600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 4109700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 4109800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 4109900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 4110000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 4110100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 4110200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.026, + "step": 4110300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0239, + "step": 4110400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0244, + "step": 4110500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 4110600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 4110700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 4110800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.026, + "step": 4110900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 4111000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 4111100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 4111200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0244, + "step": 4111300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0241, + "step": 4111400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.024, + "step": 4111500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0261, + "step": 4111600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 4111700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0244, + "step": 4111800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 4111900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 4112000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 4112100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 4112200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 4112300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 4112400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0242, + "step": 4112500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0242, + "step": 4112600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0241, + "step": 4112700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 4112800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0244, + "step": 4112900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.024, + "step": 4113000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 4113100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0241, + "step": 4113200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 4113300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 4113400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 4113500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 4113600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 4113700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 4113800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 4113900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 4114000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0258, + "step": 4114100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 4114200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 4114300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 4114400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0243, + "step": 4114500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 4114600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 4114700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0232, + "step": 4114800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 4114900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0241, + "step": 4115000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 4115100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0271, + "step": 4115200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 4115300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 4115400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0258, + "step": 4115500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0245, + "step": 4115600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 4115700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.024, + "step": 4115800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 4115900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0242, + "step": 4116000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 4116100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.024, + "step": 4116200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 4116300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 4116400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 4116500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 4116600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0259, + "step": 4116700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0243, + "step": 4116800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 4116900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0237, + "step": 4117000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0244, + "step": 4117100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 4117200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0256, + "step": 4117300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0253, + "step": 4117400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 4117500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 4117600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 4117700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 4117800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 4117900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 4118000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 4118100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 4118200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 4118300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 4118400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0257, + "step": 4118500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0241, + "step": 4118600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0247, + "step": 4118700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 4118800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.024, + "step": 4118900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 4119000 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0246, + "step": 4119100 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0241, + "step": 4119200 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0249, + "step": 4119300 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0254, + "step": 4119400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0255, + "step": 4119500 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.025, + "step": 4119600 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0248, + "step": 4119700 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.024, + "step": 4119800 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0251, + "step": 4119900 + }, + { + "epoch": 0.0, + "learning_rate": 5e-07, + "loss": 0.0252, + "step": 4120000 + }, + { + "epoch": 0.0, + "eval_loss": 0.024749755859375, + "eval_runtime": 142.3614, + "eval_samples_per_second": 351.219, + "eval_steps_per_second": 21.951, + "step": 4120000 + } + ], + "logging_steps": 100, + "max_steps": 100000000, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 20000, + "total_flos": 9.902118862348878e+18, + "train_batch_size": null, + "trial_name": null, + "trial_params": null +}